require(data.table)
require(lubridate)
require(zoo)
require(forecast)
data_path = "/Users/onurcanaydin/Desktop/360 proje/ProjectRawData.csv"
data <- fread(data_path)

#TRENDYOLMILLA TAYT

##Introduction and Model Construction

After filtering our whole data for Trendmilla Tayt,we plot graph for amount that has been sold.

maproducts <- data[data$product_content_id == 31515569]
require(ggplot2)
ggplot(maproducts, aes(x = event_date, y = sold_count)) + geom_line()

That plot has wide variance, which urges us to take logarithm of sale data.Moreover, we add day,month ,trend data for further process.Then,we plot log of sold products over time to see if we decrease variance or not.

maproducts[, `:=`(month, month(event_date))]
maproducts[, `:=`(day, lubridate::wday(event_date))]

maproducts[, `:=`(trend, 1:.N)]
maproducts[, `:=`(log_sold, log(sold_count))]
ggplot(maproducts, aes(x = event_date, y = log_sold)) + geom_line()

Variance of log of sold products seems more stable.Therefore,we start building linear regression models with logarithmic data.

lm_model <- lm(maproducts, formula = log_sold ~ month + day + 
    price + visit_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + day + price + visit_count, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.23155 -0.38023 -0.02241  0.31263  2.61907 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.518e+00  3.982e-01  18.879  < 2e-16 ***
## month        6.895e-02  1.336e-02   5.160 4.06e-07 ***
## day         -5.625e-03  1.838e-02  -0.306     0.76    
## price       -3.533e-02  7.184e-03  -4.918 1.32e-06 ***
## visit_count  1.660e-05  2.339e-06   7.096 6.66e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7084 on 367 degrees of freedom
## Multiple R-squared:  0.1824, Adjusted R-squared:  0.1734 
## F-statistic: 20.46 on 4 and 367 DF,  p-value: 3.11e-15
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.24229 -0.37555 -0.02929  0.31191  2.63026 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.491e+00  3.877e-01  19.320  < 2e-16 ***
## month        6.904e-02  1.334e-02   5.175 3.75e-07 ***
## price       -3.525e-02  7.170e-03  -4.917 1.33e-06 ***
## visit_count  1.661e-05  2.336e-06   7.109 6.13e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7075 on 368 degrees of freedom
## Multiple R-squared:  0.1821, Adjusted R-squared:  0.1755 
## F-statistic: 27.32 on 3 and 368 DF,  p-value: 5.666e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend, 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.53430 -0.43260 -0.03911  0.31180  2.38794 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.209e+01  6.651e-01  18.173  < 2e-16 ***
## month        5.710e-02  1.237e-02   4.616 5.42e-06 ***
## price       -1.052e-01  1.079e-02  -9.746  < 2e-16 ***
## visit_count  6.942e-06  2.453e-06   2.830  0.00492 ** 
## trend       -5.281e-03  6.447e-04  -8.191 4.36e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6514 on 367 degrees of freedom
## Multiple R-squared:  0.3085, Adjusted R-squared:  0.301 
## F-statistic: 40.94 on 4 and 367 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.62935 -0.43413 -0.03755  0.32096  2.52811 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.117e+01  6.648e-01  16.809  < 2e-16 ***
## month          5.724e-02  1.194e-02   4.794 2.38e-06 ***
## price         -9.338e-02  1.065e-02  -8.767  < 2e-16 ***
## visit_count    3.305e-07  2.678e-06   0.123    0.902    
## trend         -4.101e-03  6.611e-04  -6.203 1.49e-09 ***
## favored_count  1.511e-04  2.860e-05   5.285 2.16e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6288 on 366 degrees of freedom
## Multiple R-squared:  0.3576, Adjusted R-squared:  0.3488 
## F-statistic: 40.74 on 5 and 366 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count + category_sold)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count + category_sold, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.55356 -0.22639  0.04185  0.28827  0.96790 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.945e+00  5.009e-01  15.861  < 2e-16 ***
## month          1.648e-02  8.732e-03   1.887  0.05993 .  
## price         -4.214e-02  8.018e-03  -5.256 2.51e-07 ***
## visit_count   -8.494e-07  1.900e-06  -0.447  0.65508    
## trend         -9.410e-04  4.972e-04  -1.892  0.05923 .  
## favored_count  6.540e-05  2.077e-05   3.149  0.00177 ** 
## category_sold  2.748e-04  1.442e-05  19.052  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4458 on 365 degrees of freedom
## Multiple R-squared:  0.6779, Adjusted R-squared:  0.6726 
## F-statistic:   128 on 6 and 365 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + favored_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + favored_count, 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.46608 -0.33154 -0.05262  0.34463  2.75039 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.650e+00  3.624e-01  21.112  < 2e-16 ***
## month          6.551e-02  1.246e-02   5.259 2.46e-07 ***
## price         -4.060e-02  6.728e-03  -6.035 3.90e-09 ***
## visit_count    4.361e-06  2.727e-06   1.599    0.111    
## favored_count  2.110e-04  2.826e-05   7.468 6.00e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6601 on 367 degrees of freedom
## Multiple R-squared:   0.29,  Adjusted R-squared:  0.2823 
## F-statistic: 37.48 on 4 and 367 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    trend + favored_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + trend + favored_count, 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.63705 -0.43319 -0.03951  0.32334  2.52880 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.119e+01  6.494e-01  17.235  < 2e-16 ***
## month          5.668e-02  1.104e-02   5.134 4.62e-07 ***
## price         -9.355e-02  1.055e-02  -8.868  < 2e-16 ***
## trend         -4.121e-03  6.405e-04  -6.434 3.89e-10 ***
## favored_count  1.528e-04  2.525e-05   6.051 3.56e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6279 on 367 degrees of freedom
## Multiple R-squared:  0.3576, Adjusted R-squared:  0.3505 
## F-statistic: 51.06 on 4 and 367 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count + category_sold + category_brand_sold + 
    as.factor(day))
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count + category_sold + category_brand_sold + as.factor(day), 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.41717 -0.23441  0.00487  0.29603  0.92353 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.550e+00  4.879e-01  15.475  < 2e-16 ***
## month                3.861e-02  9.035e-03   4.273 2.48e-05 ***
## price               -5.041e-02  7.778e-03  -6.481 3.02e-10 ***
## visit_count          5.051e-07  1.831e-06   0.276   0.7828    
## trend                7.059e-04  5.474e-04   1.290   0.1980    
## favored_count       -6.297e-05  2.857e-05  -2.204   0.0281 *  
## category_sold        2.939e-04  1.433e-05  20.506  < 2e-16 ***
## category_brand_sold  1.104e-05  1.752e-06   6.304 8.53e-10 ***
## as.factor(day)2      5.223e-02  8.247e-02   0.633   0.5270    
## as.factor(day)3      1.648e-01  8.284e-02   1.989   0.0475 *  
## as.factor(day)4      1.036e-01  8.296e-02   1.248   0.2127    
## as.factor(day)5      9.351e-02  8.314e-02   1.125   0.2615    
## as.factor(day)6      5.406e-02  8.289e-02   0.652   0.5147    
## as.factor(day)7      1.303e-02  8.250e-02   0.158   0.8746    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4242 on 358 degrees of freedom
## Multiple R-squared:  0.714,  Adjusted R-squared:  0.7036 
## F-statistic: 68.73 on 13 and 358 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count + category_sold + category_brand_sold)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count + category_sold + category_brand_sold, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.42445 -0.24290  0.00981  0.28835  0.95183 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.581e+00  4.799e-01  15.796  < 2e-16 ***
## month                3.852e-02  9.013e-03   4.274 2.46e-05 ***
## price               -4.991e-02  7.724e-03  -6.461 3.34e-10 ***
## visit_count          7.677e-07  1.825e-06   0.421   0.6742    
## trend                7.495e-04  5.439e-04   1.378   0.1690    
## favored_count       -6.286e-05  2.838e-05  -2.214   0.0274 *  
## category_sold        2.974e-04  1.418e-05  20.974  < 2e-16 ***
## category_brand_sold  1.094e-05  1.738e-06   6.292 9.00e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.424 on 364 degrees of freedom
## Multiple R-squared:  0.7095, Adjusted R-squared:  0.7039 
## F-statistic:   127 on 7 and 364 DF,  p-value: < 2.2e-16

After evaluation of different models parameters of which is included in data, we choose model formula of which is ‘log_sold ~ month + price + visit_count + trend + favored_count + category_sold + category_brand_sold’. Furhermore,lag variables could violate our model.For that reason, we add lag1 and lag2 variables to model and examine their effectiveness.

maproducts[, `:=`(lag_1, NA)]
maproducts$lag_1[2:372] <- maproducts$log_sold[1:371]
maproducts[, `:=`(lag_2, NA)]
maproducts$lag_2[3:372] <- maproducts$log_sold[1:370]
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count + category_sold + lag_1)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count + category_sold + lag_1, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78466 -0.17543  0.02087  0.18797  0.98322 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.100e+00  4.621e-01   8.872  < 2e-16 ***
## month          7.673e-03  6.793e-03   1.130  0.25942    
## price         -2.240e-02  6.348e-03  -3.529  0.00047 ***
## visit_count    3.753e-07  1.476e-06   0.254  0.79941    
## trend         -5.040e-04  3.866e-04  -1.304  0.19309    
## favored_count  1.433e-05  1.643e-05   0.872  0.38361    
## category_sold  1.753e-04  1.368e-05  12.819  < 2e-16 ***
## lag_1          4.837e-01  3.277e-02  14.761  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3456 on 363 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.8074, Adjusted R-squared:  0.8037 
## F-statistic: 217.4 on 7 and 363 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + category_sold + lag_1)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     category_sold + lag_1, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.77848 -0.17684  0.02261  0.18454  0.98184 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.111e+00  4.618e-01   8.903  < 2e-16 ***
## month          7.256e-03  6.774e-03   1.071 0.284761    
## price         -2.282e-02  6.327e-03  -3.607 0.000352 ***
## visit_count    9.547e-07  1.318e-06   0.725 0.469206    
## trend         -5.779e-04  3.771e-04  -1.533 0.126229    
## category_sold  1.762e-04  1.363e-05  12.925  < 2e-16 ***
## lag_1          4.892e-01  3.213e-02  15.224  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3455 on 364 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.807,  Adjusted R-squared:  0.8039 
## F-statistic: 253.7 on 6 and 364 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    trend + category_sold + lag_1 + lag_2)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + trend + category_sold + 
##     lag_1 + lag_2, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78302 -0.17611  0.01115  0.18232  0.90548 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.255e+00  4.322e-01   9.844  < 2e-16 ***
## month          7.376e-03  5.836e-03   1.264 0.207059    
## price         -2.129e-02  6.003e-03  -3.546 0.000442 ***
## trend         -6.655e-04  3.378e-04  -1.970 0.049589 *  
## category_sold  1.809e-04  1.365e-05  13.248  < 2e-16 ***
## lag_1          6.085e-01  4.862e-02  12.516  < 2e-16 ***
## lag_2         -1.503e-01  4.040e-02  -3.720 0.000230 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.331 on 363 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.823,  Adjusted R-squared:  0.8201 
## F-statistic: 281.3 on 6 and 363 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    trend + category_sold + lag_1 + lag_2 + as.factor(day))
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + trend + category_sold + 
##     lag_1 + lag_2 + as.factor(day), data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78906 -0.16743  0.01298  0.18226  0.92104 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.052e+00  4.420e-01   9.168  < 2e-16 ***
## month            7.346e-03  5.798e-03   1.267 0.205974    
## price           -1.985e-02  6.010e-03  -3.303 0.001053 ** 
## trend           -6.125e-04  3.375e-04  -1.815 0.070395 .  
## category_sold    1.743e-04  1.374e-05  12.689  < 2e-16 ***
## lag_1            6.255e-01  4.965e-02  12.598  < 2e-16 ***
## lag_2           -1.481e-01  4.155e-02  -3.564 0.000414 ***
## as.factor(day)2 -8.424e-02  6.466e-02  -1.303 0.193527    
## as.factor(day)3  3.685e-02  6.487e-02   0.568 0.570368    
## as.factor(day)4 -8.666e-03  6.673e-02  -0.130 0.896750    
## as.factor(day)5  1.244e-01  6.586e-02   1.889 0.059664 .  
## as.factor(day)6  7.586e-03  6.496e-02   0.117 0.907112    
## as.factor(day)7  2.645e-03  6.449e-02   0.041 0.967309    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3288 on 357 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8282, Adjusted R-squared:  0.8225 
## F-statistic: 143.4 on 12 and 357 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    trend + category_sold + lag_1 + lag_2)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + trend + category_sold + 
##     lag_1 + lag_2, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78302 -0.17611  0.01115  0.18232  0.90548 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.255e+00  4.322e-01   9.844  < 2e-16 ***
## month          7.376e-03  5.836e-03   1.264 0.207059    
## price         -2.129e-02  6.003e-03  -3.546 0.000442 ***
## trend         -6.655e-04  3.378e-04  -1.970 0.049589 *  
## category_sold  1.809e-04  1.365e-05  13.248  < 2e-16 ***
## lag_1          6.085e-01  4.862e-02  12.516  < 2e-16 ***
## lag_2         -1.503e-01  4.040e-02  -3.720 0.000230 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.331 on 363 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.823,  Adjusted R-squared:  0.8201 
## F-statistic: 281.3 on 6 and 363 DF,  p-value: < 2.2e-16

That modification gives us better model with respect to r-squared value.Our new best model has formula that is ‘log_sold ~ month + price + trend + category_sold + lag_1 + lag_2’.

As another improving step, we could take month variable as factor variable.

lm_model <- lm(maproducts, formula = log_sold ~ as.factor(month) + 
    price + visit_count + trend + favored_count + category_sold + 
    lag_1 + lag_2)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + visit_count + 
##     trend + favored_count + category_sold + lag_1 + lag_2, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78360 -0.14550 -0.00289  0.14253  1.08028 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.071e+00  5.701e-01   8.894  < 2e-16 ***
## as.factor(month)2  -1.359e-02  1.143e-01  -0.119  0.90545    
## as.factor(month)3   3.768e-02  1.174e-01   0.321  0.74843    
## as.factor(month)4   1.321e-01  1.217e-01   1.085  0.27859    
## as.factor(month)5   1.118e-02  1.077e-01   0.104  0.91737    
## as.factor(month)6  -2.378e-02  1.319e-01  -0.180  0.85707    
## as.factor(month)7  -3.747e-01  1.216e-01  -3.082  0.00222 ** 
## as.factor(month)8  -3.626e-01  1.137e-01  -3.191  0.00155 ** 
## as.factor(month)9  -8.878e-02  1.060e-01  -0.838  0.40268    
## as.factor(month)10  9.986e-02  1.042e-01   0.958  0.33874    
## as.factor(month)11 -6.428e-02  1.081e-01  -0.595  0.55235    
## as.factor(month)12  1.721e-01  8.356e-02   2.060  0.04017 *  
## price              -2.749e-02  9.373e-03  -2.933  0.00358 ** 
## visit_count         1.091e-06  2.300e-06   0.474  0.63555    
## trend               9.213e-05  5.899e-04   0.156  0.87599    
## favored_count      -5.311e-06  2.455e-05  -0.216  0.82886    
## category_sold       1.902e-04  1.395e-05  13.635  < 2e-16 ***
## lag_1               5.437e-01  4.717e-02  11.526  < 2e-16 ***
## lag_2              -1.777e-01  3.977e-02  -4.469 1.06e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3142 on 351 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8458, Adjusted R-squared:  0.8379 
## F-statistic: 106.9 on 18 and 351 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ as.factor(month) + 
    price + visit_count + trend + category_sold + lag_1 + lag_2)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + visit_count + 
##     trend + category_sold + lag_1 + lag_2, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78042 -0.14673 -0.00239  0.14477  1.08025 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.065e+00  5.687e-01   8.907  < 2e-16 ***
## as.factor(month)2  -2.796e-03  1.027e-01  -0.027 0.978298    
## as.factor(month)3   4.539e-02  1.117e-01   0.406 0.684722    
## as.factor(month)4   1.409e-01  1.146e-01   1.229 0.219821    
## as.factor(month)5   2.125e-02  9.701e-02   0.219 0.826778    
## as.factor(month)6  -1.411e-02  1.240e-01  -0.114 0.909446    
## as.factor(month)7  -3.659e-01  1.143e-01  -3.200 0.001499 ** 
## as.factor(month)8  -3.533e-01  1.051e-01  -3.363 0.000855 ***
## as.factor(month)9  -7.815e-02  9.376e-02  -0.834 0.405104    
## as.factor(month)10  1.122e-01  8.716e-02   1.287 0.198880    
## as.factor(month)11 -5.082e-02  8.824e-02  -0.576 0.565023    
## as.factor(month)12  1.681e-01  8.140e-02   2.065 0.039611 *  
## price              -2.740e-02  9.352e-03  -2.930 0.003608 ** 
## visit_count         7.825e-07  1.802e-06   0.434 0.664389    
## trend               1.007e-04  5.878e-04   0.171 0.864061    
## category_sold       1.898e-04  1.377e-05  13.778  < 2e-16 ***
## lag_1               5.431e-01  4.703e-02  11.548  < 2e-16 ***
## lag_2              -1.787e-01  3.945e-02  -4.530 8.08e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3138 on 352 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8457, Adjusted R-squared:  0.8383 
## F-statistic: 113.5 on 17 and 352 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ as.factor(month) + 
    price + trend + category_sold + lag_1 + lag_2)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + category_sold + 
##     lag_1 + lag_2, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78021 -0.14134 -0.00437  0.14636  1.07965 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.123e+00  5.522e-01   9.277  < 2e-16 ***
## as.factor(month)2   2.237e-02  8.472e-02   0.264  0.79188    
## as.factor(month)3   7.785e-02  8.289e-02   0.939  0.34828    
## as.factor(month)4   1.646e-01  1.007e-01   1.635  0.10289    
## as.factor(month)5   3.557e-02  9.113e-02   0.390  0.69657    
## as.factor(month)6  -1.186e-03  1.202e-01  -0.010  0.99213    
## as.factor(month)7  -3.552e-01  1.116e-01  -3.184  0.00158 ** 
## as.factor(month)8  -3.447e-01  1.030e-01  -3.345  0.00091 ***
## as.factor(month)9  -7.363e-02  9.307e-02  -0.791  0.42943    
## as.factor(month)10  1.119e-01  8.706e-02   1.285  0.19954    
## as.factor(month)11 -5.567e-02  8.742e-02  -0.637  0.52467    
## as.factor(month)12  1.670e-01  8.126e-02   2.055  0.04064 *  
## price              -2.856e-02  8.954e-03  -3.190  0.00155 ** 
## trend               1.207e-05  5.506e-04   0.022  0.98252    
## category_sold       1.905e-04  1.364e-05  13.968  < 2e-16 ***
## lag_1               5.438e-01  4.695e-02  11.582  < 2e-16 ***
## lag_2              -1.775e-01  3.931e-02  -4.516  8.6e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3135 on 353 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8457, Adjusted R-squared:  0.8387 
## F-statistic: 120.9 on 16 and 353 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ as.factor(month) + 
    price + category_sold + lag_1 + lag_2)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + category_sold + 
##     lag_1 + lag_2, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78012 -0.14141 -0.00457  0.14608  1.07969 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.1311195  0.4000478  12.826  < 2e-16 ***
## as.factor(month)2   0.0218987  0.0818169   0.268 0.789121    
## as.factor(month)3   0.0776407  0.0822366   0.944 0.345755    
## as.factor(month)4   0.1649441  0.0992943   1.661 0.097566 .  
## as.factor(month)5   0.0360292  0.0885224   0.407 0.684249    
## as.factor(month)6   0.0006196  0.0874290   0.007 0.994350    
## as.factor(month)7  -0.3539073  0.0931808  -3.798 0.000172 ***
## as.factor(month)8  -0.3436076  0.0896331  -3.833 0.000149 ***
## as.factor(month)9  -0.0729030  0.0868755  -0.839 0.401943    
## as.factor(month)10  0.1122532  0.0853731   1.315 0.189410    
## as.factor(month)11 -0.0555272  0.0870522  -0.638 0.523977    
## as.factor(month)12  0.1672511  0.0801523   2.087 0.037633 *  
## price              -0.0287052  0.0061096  -4.698 3.76e-06 ***
## category_sold       0.0001905  0.0000129  14.762  < 2e-16 ***
## lag_1               0.5437774  0.0468828  11.599  < 2e-16 ***
## lag_2              -0.1774352  0.0390363  -4.545 7.54e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.313 on 354 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8457, Adjusted R-squared:  0.8391 
## F-statistic: 129.3 on 15 and 354 DF,  p-value: < 2.2e-16

That is quite developing iteration,too.From the first view of logarithm of sold products,we may predict outliers that breaks our models’ goodness.Let us put them into model.

maproducts[, `:=`(residuals, NA)]
maproducts$residuals[2:372] <- lm_model$residuals
## Warning in maproducts$residuals[2:372] <- lm_model$residuals: number of items to
## replace is not a multiple of replacement length
maproducts[!is.na(residuals), `:=`(quant5, quantile(residuals, 
    0.05))]
maproducts[!is.na(residuals), `:=`(quant95, quantile(residuals, 
    0.95))]
maproducts[, `:=`(outlier_small, as.numeric(residuals < quant5))]
maproducts[, `:=`(outlier_great, as.numeric(residuals > quant95))]

lm_model2 <- lm(maproducts, formula = log_sold ~ as.factor(month) + 
    price + category_sold + lag_1 + lag_2 + outlier_small + outlier_great)
summary(lm_model2)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + category_sold + 
##     lag_1 + lag_2 + outlier_small + outlier_great, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.75665 -0.14145 -0.00064  0.14662  0.87929 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.118e+00  3.980e-01  12.861  < 2e-16 ***
## as.factor(month)2   4.164e-02  8.071e-02   0.516 0.606239    
## as.factor(month)3   6.394e-02  8.119e-02   0.788 0.431461    
## as.factor(month)4   1.748e-01  9.829e-02   1.778 0.076221 .  
## as.factor(month)5   4.582e-02  8.796e-02   0.521 0.602790    
## as.factor(month)6   1.601e-02  8.624e-02   0.186 0.852845    
## as.factor(month)7  -3.603e-01  9.168e-02  -3.930 0.000102 ***
## as.factor(month)8  -3.596e-01  8.835e-02  -4.070 5.81e-05 ***
## as.factor(month)9  -7.967e-02  8.543e-02  -0.933 0.351684    
## as.factor(month)10  1.353e-01  8.433e-02   1.605 0.109405    
## as.factor(month)11 -6.754e-02  8.566e-02  -0.788 0.430964    
## as.factor(month)12  1.859e-01  7.914e-02   2.349 0.019354 *  
## price              -2.798e-02  6.162e-03  -4.540 7.73e-06 ***
## category_sold       1.946e-04  1.277e-05  15.244  < 2e-16 ***
## lag_1               5.203e-01  4.660e-02  11.166  < 2e-16 ***
## lag_2              -1.618e-01  3.864e-02  -4.187 3.58e-05 ***
## outlier_small       2.189e-02  7.986e-02   0.274 0.784130    
## outlier_great       2.842e-01  7.535e-02   3.772 0.000190 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3077 on 352 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8517, Adjusted R-squared:  0.8445 
## F-statistic: 118.9 on 17 and 352 DF,  p-value: < 2.2e-16
lm_model2_1 <- lm(maproducts, formula = log_sold ~ as.factor(month) + 
    price + category_sold + lag_1 + lag_2 + outlier_great)
summary(lm_model2)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + category_sold + 
##     lag_1 + lag_2 + outlier_small + outlier_great, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.75665 -0.14145 -0.00064  0.14662  0.87929 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.118e+00  3.980e-01  12.861  < 2e-16 ***
## as.factor(month)2   4.164e-02  8.071e-02   0.516 0.606239    
## as.factor(month)3   6.394e-02  8.119e-02   0.788 0.431461    
## as.factor(month)4   1.748e-01  9.829e-02   1.778 0.076221 .  
## as.factor(month)5   4.582e-02  8.796e-02   0.521 0.602790    
## as.factor(month)6   1.601e-02  8.624e-02   0.186 0.852845    
## as.factor(month)7  -3.603e-01  9.168e-02  -3.930 0.000102 ***
## as.factor(month)8  -3.596e-01  8.835e-02  -4.070 5.81e-05 ***
## as.factor(month)9  -7.967e-02  8.543e-02  -0.933 0.351684    
## as.factor(month)10  1.353e-01  8.433e-02   1.605 0.109405    
## as.factor(month)11 -6.754e-02  8.566e-02  -0.788 0.430964    
## as.factor(month)12  1.859e-01  7.914e-02   2.349 0.019354 *  
## price              -2.798e-02  6.162e-03  -4.540 7.73e-06 ***
## category_sold       1.946e-04  1.277e-05  15.244  < 2e-16 ***
## lag_1               5.203e-01  4.660e-02  11.166  < 2e-16 ***
## lag_2              -1.618e-01  3.864e-02  -4.187 3.58e-05 ***
## outlier_small       2.189e-02  7.986e-02   0.274 0.784130    
## outlier_great       2.842e-01  7.535e-02   3.772 0.000190 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3077 on 352 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8517, Adjusted R-squared:  0.8445 
## F-statistic: 118.9 on 17 and 352 DF,  p-value: < 2.2e-16

As we expected, outliers help our model to predict better. For another perspective AIC values of models.

require(ursa)
## Loading required package: ursa
AIC(lm_model)
## [1] 208.1339
AIC(lm_model2_1)
## [1] 195.5486

That proves claim of model with month as factor variable.

Now,It is time for us to come up with arima models.Firstly,we build time series with frequency=7.Plot that.

ts_sold <- ts(maproducts$log_sold, frequency = 7)
ts.plot(ts_sold)

Variance is stable enough thanks to taking logarithm of data.Then, decompose time series data with additive type. Plot decomposed data

sold_decomp <- decompose(ts_sold, type = "additive")
plot(sold_decomp)

Random part is kind of stationary except some outlier points. We shall observe autocorrelation and partial autocorrelation of random part.

random = sold_decomp$random
# random part looks really fine.
acf(sold_decomp$random, na.action = na.pass)

pacf(sold_decomp$random, na.action = na.pass)

As we observe from those graphs,we start building arima models with (p,d,q)=(2,0,0) and increase p value to 10 one by one.After this process,we find model with auto arima function. Pick the best model.

ar_model <- arima(random, order = c(2, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(2, 0, 0))
## 
## Coefficients:
##          ar1      ar2  intercept
##       0.5499  -0.3993    -0.0006
## s.e.  0.0485   0.0487     0.0189
## 
## sigma^2 estimated as 0.09456:  log likelihood = -87.97,  aic = 183.94
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -3.906212e-05 0.3074988 0.2143303 108.6211 244.7187 0.8054923
##                     ACF1
## Training set -0.04720534
ar_model <- arima(random, order = c(3, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(3, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3  intercept
##       0.5011  -0.3299  -0.1256    -0.0005
## s.e.  0.0523   0.0564   0.0526     0.0167
## 
## sigma^2 estimated as 0.09309:  log likelihood = -85.14,  aic = 180.29
## 
## Training set error measures:
##                      ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.0001426 0.3051121 0.2120258 94.60449 227.2879 0.7968318
##                     ACF1
## Training set -0.02241973
ar_model <- arima(random, order = c(4, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(4, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4  intercept
##       0.4775  -0.3933  -0.0274  -0.1959    -0.0004
## s.e.  0.0517   0.0579   0.0578   0.0520     0.0138
## 
## sigma^2 estimated as 0.08958:  log likelihood = -78.18,  aic = 168.37
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.0003721817 0.2993006 0.2059158 64.52858 206.3382 0.7738692
##                     ACF1
## Training set -0.04478536
ar_model <- arima(random, order = c(5, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(5, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5  intercept
##       0.4301  -0.3984  -0.1215  -0.0797  -0.2430    -0.0003
## s.e.  0.0511   0.0561   0.0595   0.0561   0.0513     0.0108
## 
## sigma^2 estimated as 0.08435:  log likelihood = -67.33,  aic = 148.66
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.0005495861 0.2904369 0.2020511 84.01556 210.5383 0.7593449
##                    ACF1
## Training set -0.0234773
ar_model <- arima(random, order = c(6, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(6, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5      ar6  intercept
##       0.4051  -0.4071  -0.1336  -0.1210  -0.1977  -0.1049    -0.0003
## s.e.  0.0524   0.0560   0.0595   0.0595   0.0559   0.0526     0.0097
## 
## sigma^2 estimated as 0.08343:  log likelihood = -65.35,  aic = 146.7
## 
## Training set error measures:
##                         ME      RMSE      MAE      MPE     MAPE      MASE
## Training set -0.0005821933 0.2888445 0.202161 62.75557 213.4222 0.7597579
##                     ACF1
## Training set -0.01297692
ar_model <- arima(random, order = c(7, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(7, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5      ar6      ar7  intercept
##       0.3910  -0.4344  -0.1508  -0.1388  -0.2536  -0.0485  -0.1392    -0.0003
## s.e.  0.0521   0.0564   0.0593   0.0593   0.0592   0.0562   0.0523     0.0085
## 
## sigma^2 estimated as 0.08182:  log likelihood = -61.85,  aic = 141.69
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE   MAPE      MASE
## Training set -0.0006207775 0.2860402 0.2004398 61.87008 213.06 0.7532892
##                     ACF1
## Training set -0.01228718
ar_model <- arima(random, order = c(8, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(8, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5      ar6      ar7      ar8
##       0.3770  -0.4387  -0.1760  -0.1530  -0.2680  -0.0911  -0.1002  -0.0995
## s.e.  0.0524   0.0562   0.0605   0.0595   0.0594   0.0603   0.0560   0.0525
##       intercept
##         -0.0003
## s.e.     0.0077
## 
## sigma^2 estimated as 0.08101:  log likelihood = -60.06,  aic = 140.12
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.0006786974 0.2846185 0.1999935 56.32226 211.7121 0.7516121
##                      ACF1
## Training set -0.006338126
ar_model9 <- arima(random, order = c(9, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(8, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5      ar6      ar7      ar8
##       0.3770  -0.4387  -0.1760  -0.1530  -0.2680  -0.0911  -0.1002  -0.0995
## s.e.  0.0524   0.0562   0.0605   0.0595   0.0594   0.0603   0.0560   0.0525
##       intercept
##         -0.0003
## s.e.     0.0077
## 
## sigma^2 estimated as 0.08101:  log likelihood = -60.06,  aic = 140.12
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.0006786974 0.2846185 0.1999935 56.32226 211.7121 0.7516121
##                      ACF1
## Training set -0.006338126
ar_model <- arima(random, order = c(10, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(10, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5      ar6      ar7      ar8
##       0.3618  -0.4558  -0.1944  -0.1844  -0.3076  -0.1222  -0.1516  -0.1139
## s.e.  0.0524   0.0562   0.0605   0.0609   0.0613   0.0613   0.0607   0.0606
##           ar9     ar10  intercept
##       -0.0409  -0.0997    -0.0002
## s.e.   0.0567   0.0535     0.0065
## 
## sigma^2 estimated as 0.07972:  log likelihood = -57.21,  aic = 138.43
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.0008592798 0.2823551 0.1980518 34.04849 224.5072 0.7443149
##                    ACF1
## Training set -0.0178424
auto.arima(random, seasonal = FALSE, trace = TRUE)
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,0,2)           with non-zero mean : 91.79447
##  ARIMA(0,0,0)           with non-zero mean : 301.3164
##  ARIMA(1,0,0)           with non-zero mean : 244.3749
##  ARIMA(0,0,1)           with non-zero mean : 209.4558
##  ARIMA(0,0,0)           with zero mean     : 299.2956
##  ARIMA(1,0,2)           with non-zero mean : Inf
##  ARIMA(2,0,1)           with non-zero mean : 92.18419
##  ARIMA(3,0,2)           with non-zero mean : Inf
##  ARIMA(2,0,3)           with non-zero mean : 75.22619
##  ARIMA(1,0,3)           with non-zero mean : Inf
##  ARIMA(3,0,3)           with non-zero mean : Inf
##  ARIMA(2,0,4)           with non-zero mean : 64.74229
##  ARIMA(1,0,4)           with non-zero mean : Inf
##  ARIMA(3,0,4)           with non-zero mean : Inf
##  ARIMA(2,0,5)           with non-zero mean : 66.84559
##  ARIMA(1,0,5)           with non-zero mean : Inf
##  ARIMA(3,0,5)           with non-zero mean : Inf
##  ARIMA(2,0,4)           with zero mean     : 62.74181
##  ARIMA(1,0,4)           with zero mean     : 108.0237
##  ARIMA(2,0,3)           with zero mean     : 73.24936
##  ARIMA(3,0,4)           with zero mean     : Inf
##  ARIMA(2,0,5)           with zero mean     : 64.78566
##  ARIMA(1,0,3)           with zero mean     : 107.0578
##  ARIMA(1,0,5)           with zero mean     : 109.7789
##  ARIMA(3,0,3)           with zero mean     : Inf
##  ARIMA(3,0,5)           with zero mean     : Inf
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(2,0,4)           with zero mean     : Inf
##  ARIMA(2,0,4)           with non-zero mean : Inf
##  ARIMA(2,0,5)           with zero mean     : Inf
##  ARIMA(2,0,5)           with non-zero mean : Inf
##  ARIMA(2,0,3)           with zero mean     : Inf
##  ARIMA(2,0,3)           with non-zero mean : Inf
##  ARIMA(2,0,2)           with non-zero mean : Inf
##  ARIMA(2,0,1)           with non-zero mean : Inf
##  ARIMA(1,0,3)           with zero mean     : Inf
##  ARIMA(1,0,4)           with zero mean     : Inf
##  ARIMA(1,0,5)           with zero mean     : Inf
##  ARIMA(0,0,1)           with non-zero mean : 209.7616
## 
##  Best model: ARIMA(0,0,1)           with non-zero mean
## Series: random 
## ARIMA(0,0,1) with non-zero mean 
## 
## Coefficients:
##          ma1     mean
##       0.5216  -0.0007
## s.e.  0.0396   0.0254
## 
## sigma^2 estimated as 0.1026:  log likelihood=-101.85
## AIC=209.7   AICc=209.76   BIC=221.4
fitted_model <- arima(random, order = c(9, 0, 0))

Best model we have above is the model with (p,d,q)=(9,0,0).

To compare lm and arima models, we should compare them for test period which is from 2021-05-11 to 2021-05-29.

train_start = as.Date("2020-05-25")
test_start = as.Date("2021-05-11")
test_end = as.Date("2021-05-29")

test_dates = seq(test_start, test_end, by = "day")
# forecast with lm model
forecast_with_lr = function(fmla, data, forecast_data) {
    fitted_lm = lm(as.formula(fmla), data)
    forecasted = predict(fitted_lm, forecast_data)
    return(list(forecast = as.numeric(forecasted), model = fitted_lm))
}

# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "log_sold", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arima(input_series, order = c(9, 0, 0))

    forecasted = forecast(fitted, h = forecast_ahead)
    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}

We define our functions with respect to parameters we found above.Then,we forecast for test dates with our lm and arima models separately.

# loop over the test dates
forecast_ahead = 1


results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
    current_date = test_dates[i] - forecast_ahead
    print(test_dates[i])
    past_data = maproducts[event_date <= current_date]
    forecast_data = maproducts[event_date == test_dates[i]]

    # first lm models
    fmla = "log_sold ~ as.factor(month) + price + category_sold + 
    lag_1 + lag_2 + outlier_great"
    forecasted = forecast_with_lr(fmla, past_data, forecast_data)
    forecast_data[, `:=`(lm_prediction, forecasted$forecast)]

    # arima model with auto.arima
    arima_forecast = forecast_with_arima(past_data, forecast_ahead, 
        "log_sold", is_trace = F)
    forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]

    results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$log_sold"
## [1] "2021-05-12"
## [1] "input_series=data$log_sold"
## [1] "2021-05-13"
## [1] "input_series=data$log_sold"
## [1] "2021-05-14"
## [1] "input_series=data$log_sold"
## [1] "2021-05-15"
## [1] "input_series=data$log_sold"
## [1] "2021-05-16"
## [1] "input_series=data$log_sold"
## [1] "2021-05-17"
## [1] "input_series=data$log_sold"
## [1] "2021-05-18"
## [1] "input_series=data$log_sold"
## [1] "2021-05-19"
## [1] "input_series=data$log_sold"
## [1] "2021-05-20"
## [1] "input_series=data$log_sold"
## [1] "2021-05-21"
## [1] "input_series=data$log_sold"
## [1] "2021-05-22"
## [1] "input_series=data$log_sold"
## [1] "2021-05-23"
## [1] "input_series=data$log_sold"
## [1] "2021-05-24"
## [1] "input_series=data$log_sold"
## [1] "2021-05-25"
## [1] "input_series=data$log_sold"
## [1] "2021-05-26"
## [1] "input_series=data$log_sold"
## [1] "2021-05-27"
## [1] "input_series=data$log_sold"
## [1] "2021-05-28"
## [1] "input_series=data$log_sold"
## [1] "2021-05-29"
## [1] "input_series=data$log_sold"
overall_results = rbindlist(results)

melted_result = melt(overall_results, c("event_date", "log_sold"), 
    c("lm_prediction", "arima_prediction"))

We turned the results into melted results(long form).

melted_result
##     event_date log_sold         variable    value
##  1: 2021-05-11 5.894403    lm_prediction 6.072349
##  2: 2021-05-12 6.257668    lm_prediction 6.219121
##  3: 2021-05-13 6.327937    lm_prediction 6.120971
##  4: 2021-05-14 6.238325    lm_prediction 6.264525
##  5: 2021-05-15 6.309918    lm_prediction 6.344856
##  6: 2021-05-16 6.272877    lm_prediction 6.087698
##  7: 2021-05-17 5.680173    lm_prediction 5.943813
##  8: 2021-05-18 5.707110    lm_prediction 5.858567
##  9: 2021-05-19 5.609472    lm_prediction 5.804912
## 10: 2021-05-20 5.541264    lm_prediction 5.701320
## 11: 2021-05-21 5.433722    lm_prediction 5.731810
## 12: 2021-05-22 5.564520    lm_prediction 5.811365
## 13: 2021-05-23 5.648974    lm_prediction 5.747404
## 14: 2021-05-24 5.480639    lm_prediction 5.650380
## 15: 2021-05-25 5.375278    lm_prediction 5.657188
## 16: 2021-05-26 5.463832    lm_prediction 5.727750
## 17: 2021-05-27 5.638355    lm_prediction 5.649980
## 18: 2021-05-28 5.480639    lm_prediction 5.783207
## 19: 2021-05-29 5.755742    lm_prediction 7.053687
## 20: 2021-05-11 5.894403 arima_prediction 6.582012
## 21: 2021-05-12 6.257668 arima_prediction 6.581654
## 22: 2021-05-13 6.327937 arima_prediction 6.582530
## 23: 2021-05-14 6.238325 arima_prediction 6.582511
## 24: 2021-05-15 6.309918 arima_prediction 6.581589
## 25: 2021-05-16 6.272877 arima_prediction 6.581623
## 26: 2021-05-17 5.680173 arima_prediction 6.581908
## 27: 2021-05-18 5.707110 arima_prediction 6.581376
## 28: 2021-05-19 5.609472 arima_prediction 6.581444
## 29: 2021-05-20 5.541264 arima_prediction 6.581291
## 30: 2021-05-21 5.433722 arima_prediction 6.580898
## 31: 2021-05-22 5.564520 arima_prediction 6.577849
## 32: 2021-05-23 5.648974 arima_prediction 6.577888
## 33: 2021-05-24 5.480639 arima_prediction 6.577618
## 34: 2021-05-25 5.375278 arima_prediction 6.577243
## 35: 2021-05-26 5.463832 arima_prediction 6.575795
## 36: 2021-05-27 5.638355 arima_prediction 6.575985
## 37: 2021-05-28 5.480639 arima_prediction 6.576003
## 38: 2021-05-29 5.755742 arima_prediction 6.574101
##     event_date log_sold         variable    value
accu = function(actual, forecast) {
    n = length(actual)
    error = actual - forecast
    mean = mean(actual)
    sd = sd(actual)
    CV = sd/mean
    FBias = sum(error)/sum(actual)
    MAPE = sum(abs(error/actual))/n
    RMSE = sqrt(sum(error^2)/n)
    MAD = sum(abs(error))/n
    MADP = sum(abs(error))/sum(abs(actual))
    WMAPE = MAD/mean
    l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP, 
        WMAPE)
    return(l)
}

To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.

performance = melted_result[, accu(log_sold, value), by = list(variable)]

performance
##            variable  n     mean        sd         CV       FBias      MAPE
## 1:    lm_prediction 19 5.772676 0.3352264 0.05807123 -0.03236714 0.0408488
## 2: arima_prediction 19 5.772676 0.3352264 0.05807123 -0.13977345 0.1432785
##         RMSE       MAD      MADP     WMAPE
## 1: 0.3540892 0.2321811 0.0402207 0.0402207
## 2: 0.8697548 0.8068669 0.1397734 0.1397734

First view of data shows that lm prediction is better in terms of MAPE.However,let us see boxplot MAPE and FBias with respect to days of a week.

performance = melted_result[, accu(log_sold, value), by = list(event_date, 
    variable)]
performance[, `:=`(day_of_week, wday(event_date, label = T))]


ggplot(performance, aes(x = day_of_week, y = MAPE, fill = variable)) + 
    geom_boxplot()

ggplot(performance, aes(x = day_of_week, y = FBias, fill = variable)) + 
    geom_boxplot()

As we observe from table,MAPE values of lm prediction is less.They both tend to predict less.However,lm is better than arima in terms of FBias ,too.

So,we forecast with our lm model.Before that,we forecast our parameters by building arima model with auto arima function.

price_ts <- ts(maproducts$price, frequency = 7)
price_dec <- decompose(x = price_ts, type = "additive")
price_model = auto.arima(price_dec$random)
price_model_forecast <- predict(price_model, n.ahead = 15)$pred
seasonality = price_dec$seasonal[1:1]
last_trend_value <- tail(price_dec$trend[!is.na(price_dec$trend)], 
    1)
price_model_forecast = price_model_forecast + last_trend_value + 
    seasonality

model_cat <- auto.arima(maproducts$category_sold)
cat_fcast <- predict(model_cat, n.ahead = 15)$pred
model_lag_1 <- auto.arima(maproducts$lag_1)
lag1_fcast <- predict(model_lag_1, n.ahead = 15)$pred
lag_2_ts <- ts(maproducts$lag_2, frequency = 7)
lag_2_dec <- decompose(x = lag_2_ts, type = "additive")
lag_2_model = auto.arima(lag_2_dec$random)
lag_2_model_forecast <- predict(lag_2_model, n.ahead = 15)$pred
seasonality = lag_2_dec$seasonal[1:1]
last_trend_value <- tail(lag_2_dec$trend[!is.na(lag_2_dec$trend)], 
    1)
lag_2_model_forecast = lag_2_model_forecast + last_trend_value + 
    seasonality
outlier_ts <- ts(maproducts$outlier_great, frequency = 6)
outlier_dec <- decompose(x = outlier_ts, type = "additive")
outlier_model = auto.arima(outlier_dec$random)
outlier_model_forecast <- predict(outlier_model, n.ahead = 15)$pred
seasonality = outlier_dec$seasonal[1:1]
last_trend_value <- tail(outlier_dec$trend[!is.na(outlier_dec$trend)], 
    1)
outlier_model_forecast = outlier_model_forecast + last_trend_value + 
    seasonality

Now,we forecast our value with our forecasted parameters.

forecast_Log <- predict(lm_model2_1, data.frame(month = as.factor(6), 
    price = price_model_forecast[1], category_sold = cat_fcast[1], 
    lag_1 = lag1_fcast[1], lag_2 = lag_2_model_forecast[1], outlier_great = outlier_model_forecast[1]))

fcast <- exp(forecast_Log)

##Xiaomi Bluetooth Kulaklık

After filtering our whole data for Xiaomi Bluetooth Kulaklık,we plot graph for amount that has been sold.

maproducts <- data[data$product_content_id == 6676673]
require(ggplot2)
ggplot(maproducts, aes(x = event_date, y = sold_count)) + geom_line()

That plot has wide variance, which urges us to take logarithm of sale data.Moreover, we add day,month ,trend data for further process.Then,we plot log of sold products over time to see if we decrease variance or not.

maproducts[, `:=`(month, month(event_date))]
maproducts[, `:=`(day, lubridate::wday(event_date))]

head(maproducts, 10)
##     event_date product_content_id    price sold_count visit_count basket_count
##  1: 2021-05-31            6676673 120.6812        345       19578         1261
##  2: 2021-05-30            6676673 122.3905        342       16238         1214
##  3: 2021-05-29            6676673 123.7674        286       15222         1063
##  4: 2021-05-28            6676673 126.5967        258       13615          966
##  5: 2021-05-27            6676673 126.6857        268       12907          937
##  6: 2021-05-26            6676673 129.5155        233       13614          942
##  7: 2021-05-25            6676673 130.1068        280       15730         1160
##  8: 2021-05-24            6676673 122.8083        384       18266         1467
##  9: 2021-05-23            6676673 111.5975        562       26286         2069
## 10: 2021-05-22            6676673 114.6897        535       24943         1862
##     favored_count category_sold category_visits category_basket
##  1:          1510          4944          306462           23418
##  2:           947          4442          298876           21479
##  3:           880          4437          286091           21416
##  4:           657           647            4604          270790
##  5:           638           703            4637          273490
##  6:           657           746            4842          276308
##  7:           810           726            4719          283543
##  8:          1032           809            5140          288733
##  9:          1560           982            5304          347131
## 10:          1342           979            5378          339929
##     category_favored category_brand_sold ty_visits month day
##  1:            26597                 786 125439876     5   2
##  2:            23809                 696 131821083     5   1
##  3:            20727                 653 129670029     5   7
##  4:            19955               19556 103514886     5   6
##  5:            20959               20972 107391579     5   5
##  6:            21197               18830 106195988     5   4
##  7:            21665               18239 107004119     5   3
##  8:            22203               20886 108235639     5   2
##  9:            25978               26571 134993625     5   1
## 10:            24575               22101 133292217     5   7
maproducts[, `:=`(trend, 1:.N)]
maproducts[, `:=`(log_sold, log(sold_count))]
ggplot(maproducts, aes(x = event_date, y = log_sold)) + geom_line()

Variance of log of sold products seems more stable .Therefore,we start building linear regression models with logarithmic data.

lm_model <- lm(maproducts, formula = log_sold ~ month + day + 
    price + visit_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + day + price + visit_count, data = maproducts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2728 -0.2746  0.0323  0.2611  1.3832 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.661e+00  2.943e-01  32.823   <2e-16 ***
## month        1.147e-02  8.258e-03   1.389    0.166    
## day         -4.220e-03  1.147e-02  -0.368    0.713    
## price       -2.836e-02  2.135e-03 -13.284   <2e-16 ***
## visit_count -1.371e-06  2.568e-06  -0.534    0.594    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4417 on 367 degrees of freedom
## Multiple R-squared:  0.3684, Adjusted R-squared:  0.3615 
## F-statistic: 53.51 on 4 and 367 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.28501 -0.27663  0.03077  0.26458  1.38308 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.639e+00  2.877e-01  33.499   <2e-16 ***
## month        1.140e-02  8.246e-03   1.383    0.168    
## price       -2.832e-02  2.129e-03 -13.300   <2e-16 ***
## visit_count -1.374e-06  2.565e-06  -0.536    0.593    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4412 on 368 degrees of freedom
## Multiple R-squared:  0.3681, Adjusted R-squared:  0.363 
## F-statistic: 71.46 on 3 and 368 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend, 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.04989 -0.26063 -0.01178  0.27438  1.50117 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.017e+00  2.769e-01  32.562  < 2e-16 ***
## month        3.444e-03  7.683e-03   0.448    0.654    
## price       -2.711e-02  1.973e-03 -13.744  < 2e-16 ***
## visit_count  1.311e-05  2.982e-06   4.398 1.43e-05 ***
## trend        2.216e-03  2.767e-04   8.007 1.57e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4076 on 367 degrees of freedom
## Multiple R-squared:  0.4621, Adjusted R-squared:  0.4562 
## F-statistic: 78.82 on 4 and 367 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.90224 -0.24750  0.00356  0.21453  1.59387 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    8.907e+00  2.558e-01  34.819  < 2e-16 ***
## month          7.292e-03  7.103e-03   1.027    0.305    
## price         -2.805e-02  1.823e-03 -15.385  < 2e-16 ***
## visit_count    3.372e-06  3.002e-06   1.123    0.262    
## trend          2.809e-03  2.656e-04  10.578  < 2e-16 ***
## favored_count  2.457e-04  3.039e-05   8.086 9.14e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.376 on 366 degrees of freedom
## Multiple R-squared:  0.5436, Adjusted R-squared:  0.5374 
## F-statistic: 87.19 on 5 and 366 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + favored_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + favored_count, 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.23099 -0.26339  0.01086  0.24428  1.42204 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.674e+00  2.799e-01  34.563  < 2e-16 ***
## month          1.522e-02  8.060e-03   1.888  0.05975 .  
## price         -2.912e-02  2.078e-03 -14.017  < 2e-16 ***
## visit_count   -1.007e-05  3.104e-06  -3.244  0.00129 ** 
## favored_count  1.568e-04  3.333e-05   4.706 3.59e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.429 on 367 degrees of freedom
## Multiple R-squared:  0.4041, Adjusted R-squared:  0.3976 
## F-statistic: 62.21 on 4 and 367 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    trend + favored_count)

summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + trend + favored_count, 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.92261 -0.24026  0.00556  0.20907  1.59245 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.028e+00  2.321e-01  38.897   <2e-16 ***
## month          5.767e-03  6.974e-03   0.827    0.409    
## price         -2.860e-02  1.758e-03 -16.266   <2e-16 ***
## trend          2.683e-03  2.407e-04  11.147   <2e-16 ***
## favored_count  2.594e-04  2.785e-05   9.316   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3761 on 367 degrees of freedom
## Multiple R-squared:  0.542,  Adjusted R-squared:  0.5371 
## F-statistic: 108.6 on 4 and 367 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count + category_sold)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count + category_sold, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.80049 -0.19063  0.02167  0.19808  0.90490 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.770e+00  2.384e-01  32.590  < 2e-16 ***
## month          6.100e-03  6.055e-03   1.008    0.314    
## price         -2.271e-02  1.619e-03 -14.025  < 2e-16 ***
## visit_count    1.791e-06  2.563e-06   0.699    0.485    
## trend          3.141e-03  2.281e-04  13.770  < 2e-16 ***
## favored_count  2.071e-04  2.611e-05   7.932 2.66e-14 ***
## category_sold  4.516e-04  3.834e-05  11.780  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3205 on 365 degrees of freedom
## Multiple R-squared:  0.6693, Adjusted R-squared:  0.6639 
## F-statistic: 123.1 on 6 and 365 DF,  p-value: < 2.2e-16

After evaluation of different models parameters of which is included in data, we choose model formula of which is ‘maproducts,formula=log_sold~month+price+visit_count+trend+favored_count+category_sold’. Furhermore,lag variables could violate our model.For that reason, we add lag1 and lag2 variables to model and examine their effectiveness.

maproducts[, `:=`(lag_1, NA)]
maproducts$lag_1[2:372] <- maproducts$log_sold[1:371]
maproducts[, `:=`(lag_2, NA)]
maproducts$lag_2[3:372] <- maproducts$log_sold[1:370]
# View(maproducts)
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count + category_sold + lag_1)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count + category_sold + lag_1, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.41945 -0.13333  0.00662  0.16028  0.90177 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.280e+00  3.078e-01  13.904  < 2e-16 ***
## month          7.230e-03  4.687e-03   1.543    0.124    
## price         -1.341e-02  1.407e-03  -9.537  < 2e-16 ***
## visit_count    1.161e-06  1.984e-06   0.585    0.559    
## trend          1.916e-03  1.978e-04   9.687  < 2e-16 ***
## favored_count  1.136e-04  2.115e-05   5.369 1.42e-07 ***
## category_sold  3.894e-04  3.564e-05  10.926  < 2e-16 ***
## lag_1          4.403e-01  3.277e-02  13.434  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.248 on 363 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.8031, Adjusted R-squared:  0.7993 
## F-statistic: 211.6 on 7 and 363 DF,  p-value: < 2.2e-16
# 10.06
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    trend + category_sold + lag_1 + lag_2 + as.factor(day))
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + trend + category_sold + 
##     lag_1 + lag_2 + as.factor(day), data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.87893 -0.10671  0.01649  0.14189  0.75873 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.091e+00  3.122e-01  13.105   <2e-16 ***
## month            3.777e-03  4.557e-03   0.829   0.4077    
## price           -1.247e-02  1.375e-03  -9.069   <2e-16 ***
## trend            1.347e-03  1.530e-04   8.807   <2e-16 ***
## category_sold    5.021e-04  4.103e-05  12.235   <2e-16 ***
## lag_1            4.678e-01  4.634e-02  10.095   <2e-16 ***
## lag_2           -7.137e-03  4.096e-02  -0.174   0.8618    
## as.factor(day)2  6.832e-02  4.847e-02   1.410   0.1595    
## as.factor(day)3  7.713e-02  4.868e-02   1.584   0.1140    
## as.factor(day)4  6.508e-02  4.881e-02   1.333   0.1833    
## as.factor(day)5  9.323e-02  4.924e-02   1.893   0.0591 .  
## as.factor(day)6  8.733e-02  4.892e-02   1.785   0.0751 .  
## as.factor(day)7  4.854e-02  4.898e-02   0.991   0.3223    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.247 on 357 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8079, Adjusted R-squared:  0.8014 
## F-statistic: 125.1 on 12 and 357 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    trend + category_sold + lag_1 + lag_2)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + trend + category_sold + 
##     lag_1 + lag_2, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.94258 -0.11454  0.02477  0.14676  0.76518 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.180e+00  3.023e-01  13.830   <2e-16 ***
## month          3.848e-03  4.548e-03   0.846    0.398    
## price         -1.248e-02  1.355e-03  -9.209   <2e-16 ***
## trend          1.372e-03  1.520e-04   9.023   <2e-16 ***
## category_sold  5.171e-04  4.021e-05  12.859   <2e-16 ***
## lag_1          4.697e-01  4.532e-02  10.364   <2e-16 ***
## lag_2         -1.634e-02  4.016e-02  -0.407    0.684    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2466 on 363 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8053, Adjusted R-squared:  0.8021 
## F-statistic: 250.2 on 6 and 363 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count + category_sold + lag_1 + 
    lag_2)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count + category_sold + lag_1 + lag_2, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.79701 -0.12755  0.02016  0.15447  0.73915 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.472e+00  3.012e-01  14.847  < 2e-16 ***
## month          7.660e-03  4.480e-03   1.710   0.0881 .  
## price         -1.359e-02  1.345e-03 -10.107  < 2e-16 ***
## visit_count    6.683e-07  1.892e-06   0.353   0.7241    
## trend          1.973e-03  1.891e-04  10.434  < 2e-16 ***
## favored_count  1.078e-04  2.026e-05   5.319 1.83e-07 ***
## category_sold  4.975e-04  3.875e-05  12.839  < 2e-16 ***
## lag_1          4.310e-01  4.392e-02   9.812  < 2e-16 ***
## lag_2         -3.568e-02  3.862e-02  -0.924   0.3562    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2363 on 361 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8222, Adjusted R-squared:  0.8183 
## F-statistic: 208.7 on 8 and 361 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count + category_sold + lag_1 + 
    lag_2 + category_favored)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count + category_sold + lag_1 + lag_2 + category_favored, 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.45142 -0.13499  0.02015  0.16021  0.70440 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.791e+00  3.096e-01  15.473  < 2e-16 ***
## month             4.512e-03  4.496e-03   1.004 0.316276    
## price            -1.558e-02  1.436e-03 -10.848  < 2e-16 ***
## visit_count       1.081e-06  1.866e-06   0.579 0.562858    
## trend             2.321e-03  2.102e-04  11.043  < 2e-16 ***
## favored_count     1.028e-04  1.999e-05   5.143 4.44e-07 ***
## category_sold     4.141e-04  4.473e-05   9.257  < 2e-16 ***
## lag_1             4.056e-01  4.381e-02   9.258  < 2e-16 ***
## lag_2            -4.091e-02  3.804e-02  -1.076 0.282825    
## category_favored  6.939e-06  1.947e-06   3.565 0.000413 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2325 on 360 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8283, Adjusted R-squared:  0.824 
## F-statistic: 192.9 on 9 and 360 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ as.factor(month) + 
    price + trend + favored_count + category_sold + lag_1 + lag_2 + 
    category_favored)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count + 
##     category_sold + lag_1 + lag_2 + category_favored, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.57855 -0.10165  0.01745  0.12739  0.45999 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.427e+00  3.377e-01  19.033  < 2e-16 ***
## as.factor(month)2  -3.978e-02  5.577e-02  -0.713 0.476174    
## as.factor(month)3  -4.196e-01  6.367e-02  -6.590 1.61e-10 ***
## as.factor(month)4  -6.162e-02  6.030e-02  -1.022 0.307532    
## as.factor(month)5   6.037e-02  5.819e-02   1.037 0.300258    
## as.factor(month)6  -6.674e-02  7.854e-02  -0.850 0.396061    
## as.factor(month)7  -1.574e-01  7.971e-02  -1.974 0.049147 *  
## as.factor(month)8  -1.890e-03  7.056e-02  -0.027 0.978642    
## as.factor(month)9  -2.375e-01  6.819e-02  -3.483 0.000558 ***
## as.factor(month)10 -6.981e-02  6.753e-02  -1.034 0.301980    
## as.factor(month)11 -1.906e-01  6.879e-02  -2.772 0.005876 ** 
## as.factor(month)12  2.509e-02  5.475e-02   0.458 0.647024    
## price              -2.143e-02  1.628e-03 -13.166  < 2e-16 ***
## trend               2.666e-03  2.762e-04   9.650  < 2e-16 ***
## favored_count       6.740e-05  2.070e-05   3.256 0.001238 ** 
## category_sold       4.090e-04  4.074e-05  10.041  < 2e-16 ***
## lag_1               2.958e-01  4.085e-02   7.240 2.85e-12 ***
## lag_2              -8.038e-02  3.479e-02  -2.310 0.021440 *  
## category_favored    1.260e-05  2.030e-06   6.207 1.52e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.208 on 351 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.866,  Adjusted R-squared:  0.8591 
## F-statistic:   126 on 18 and 351 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ as.factor(month) + 
    price + trend + favored_count + category_sold + lag_1 + lag_2 + 
    category_favored)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count + 
##     category_sold + lag_1 + lag_2 + category_favored, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.57855 -0.10165  0.01745  0.12739  0.45999 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.427e+00  3.377e-01  19.033  < 2e-16 ***
## as.factor(month)2  -3.978e-02  5.577e-02  -0.713 0.476174    
## as.factor(month)3  -4.196e-01  6.367e-02  -6.590 1.61e-10 ***
## as.factor(month)4  -6.162e-02  6.030e-02  -1.022 0.307532    
## as.factor(month)5   6.037e-02  5.819e-02   1.037 0.300258    
## as.factor(month)6  -6.674e-02  7.854e-02  -0.850 0.396061    
## as.factor(month)7  -1.574e-01  7.971e-02  -1.974 0.049147 *  
## as.factor(month)8  -1.890e-03  7.056e-02  -0.027 0.978642    
## as.factor(month)9  -2.375e-01  6.819e-02  -3.483 0.000558 ***
## as.factor(month)10 -6.981e-02  6.753e-02  -1.034 0.301980    
## as.factor(month)11 -1.906e-01  6.879e-02  -2.772 0.005876 ** 
## as.factor(month)12  2.509e-02  5.475e-02   0.458 0.647024    
## price              -2.143e-02  1.628e-03 -13.166  < 2e-16 ***
## trend               2.666e-03  2.762e-04   9.650  < 2e-16 ***
## favored_count       6.740e-05  2.070e-05   3.256 0.001238 ** 
## category_sold       4.090e-04  4.074e-05  10.041  < 2e-16 ***
## lag_1               2.958e-01  4.085e-02   7.240 2.85e-12 ***
## lag_2              -8.038e-02  3.479e-02  -2.310 0.021440 *  
## category_favored    1.260e-05  2.030e-06   6.207 1.52e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.208 on 351 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.866,  Adjusted R-squared:  0.8591 
## F-statistic:   126 on 18 and 351 DF,  p-value: < 2.2e-16

That modification gives us better model with respect to r-squared value.Our new best model has formula that is ‘log_sold ~ as.factor(month) + price + trend + favored_count + category_sold + lag_1 + lag_2 + category_favored’.

From the first view of logarithm of sold products,we may predict outliers that breaks our models’ goodness.Let us put them into model.

maproducts[, `:=`(residuals, NA)]
maproducts$residuals[2:372] <- lm_model$residuals
## Warning in maproducts$residuals[2:372] <- lm_model$residuals: number of items to
## replace is not a multiple of replacement length
maproducts[!is.na(residuals), `:=`(quant5, quantile(residuals, 
    0.05))]
maproducts[!is.na(residuals), `:=`(quant95, quantile(residuals, 
    0.95))]
maproducts[, `:=`(outlier_small, as.numeric(residuals < quant5))]
maproducts[, `:=`(outlier_great, as.numeric(residuals > quant95))]

lm_model2 <- lm(maproducts, formula = log_sold ~ as.factor(month) + 
    price + category_sold + lag_1 + lag_2 + outlier_small + outlier_great)
summary(lm_model2)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + category_sold + 
##     lag_1 + lag_2 + outlier_small + outlier_great, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.10580 -0.10063  0.00898  0.12142  0.70804 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.496e+00  3.672e-01  14.969  < 2e-16 ***
## as.factor(month)2  -9.901e-02  6.328e-02  -1.565 0.118562    
## as.factor(month)3  -4.949e-01  6.918e-02  -7.154 4.91e-12 ***
## as.factor(month)4  -2.390e-01  6.608e-02  -3.617 0.000342 ***
## as.factor(month)5  -1.460e-01  6.171e-02  -2.367 0.018490 *  
## as.factor(month)6   2.617e-01  6.317e-02   4.143 4.30e-05 ***
## as.factor(month)7   7.182e-02  6.589e-02   1.090 0.276475    
## as.factor(month)8   1.703e-01  6.220e-02   2.738 0.006498 ** 
## as.factor(month)9  -4.765e-02  6.332e-02  -0.753 0.452212    
## as.factor(month)10  8.910e-02  6.777e-02   1.315 0.189405    
## as.factor(month)11 -1.008e-02  6.394e-02  -0.158 0.874822    
## as.factor(month)12  1.143e-01  6.225e-02   1.835 0.067284 .  
## price              -1.698e-02  1.783e-03  -9.519  < 2e-16 ***
## category_sold       5.078e-04  3.987e-05  12.738  < 2e-16 ***
## lag_1               4.175e-01  4.503e-02   9.271  < 2e-16 ***
## lag_2              -3.025e-02  3.995e-02  -0.757 0.449330    
## outlier_small      -6.693e-03  5.862e-02  -0.114 0.909157    
## outlier_great       8.211e-02  5.909e-02   1.390 0.165539    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2383 on 352 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8237, Adjusted R-squared:  0.8152 
## F-statistic: 96.73 on 17 and 352 DF,  p-value: < 2.2e-16
lm_model2 <- lm(maproducts, formula = log_sold ~ as.factor(month) + 
    price + category_sold + lag_1 + lag_2 + outlier_great)
summary(lm_model2)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + category_sold + 
##     lag_1 + lag_2 + outlier_great, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.10468 -0.10017  0.00919  0.12154  0.70849 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.495e+00  3.666e-01  14.992  < 2e-16 ***
## as.factor(month)2  -9.956e-02  6.300e-02  -1.580 0.114974    
## as.factor(month)3  -4.953e-01  6.900e-02  -7.178 4.20e-12 ***
## as.factor(month)4  -2.396e-01  6.579e-02  -3.642 0.000311 ***
## as.factor(month)5  -1.466e-01  6.145e-02  -2.386 0.017576 *  
## as.factor(month)6   2.614e-01  6.302e-02   4.148 4.21e-05 ***
## as.factor(month)7   7.168e-02  6.579e-02   1.090 0.276650    
## as.factor(month)8   1.702e-01  6.211e-02   2.741 0.006440 ** 
## as.factor(month)9  -4.800e-02  6.315e-02  -0.760 0.447748    
## as.factor(month)10  8.862e-02  6.754e-02   1.312 0.190332    
## as.factor(month)11 -1.066e-02  6.364e-02  -0.168 0.867018    
## as.factor(month)12  1.137e-01  6.198e-02   1.835 0.067399 .  
## price              -1.697e-02  1.779e-03  -9.536  < 2e-16 ***
## category_sold       5.077e-04  3.979e-05  12.758  < 2e-16 ***
## lag_1               4.179e-01  4.485e-02   9.316  < 2e-16 ***
## lag_2              -3.063e-02  3.976e-02  -0.770 0.441591    
## outlier_great       8.251e-02  5.890e-02   1.401 0.162089    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.238 on 353 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8237, Adjusted R-squared:  0.8157 
## F-statistic: 103.1 on 16 and 353 DF,  p-value: < 2.2e-16
lm_model2 <- lm(maproducts, formula = log_sold ~ as.factor(month) + 
    price + trend + favored_count + category_sold + lag_1 + lag_2 + 
    category_favored + outlier_small + outlier_great)
summary(lm_model2)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count + 
##     category_sold + lag_1 + lag_2 + category_favored + outlier_small + 
##     outlier_great, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.55511 -0.10489  0.02323  0.12605  0.49855 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.620e+00  3.392e-01  19.512  < 2e-16 ***
## as.factor(month)2  -4.963e-02  5.540e-02  -0.896 0.370961    
## as.factor(month)3  -4.435e-01  6.345e-02  -6.990 1.40e-11 ***
## as.factor(month)4  -7.622e-02  6.007e-02  -1.269 0.205321    
## as.factor(month)5   4.010e-02  5.807e-02   0.691 0.490274    
## as.factor(month)6  -8.651e-02  7.789e-02  -1.111 0.267442    
## as.factor(month)7  -1.865e-01  7.932e-02  -2.352 0.019252 *  
## as.factor(month)8  -1.804e-02  6.995e-02  -0.258 0.796578    
## as.factor(month)9  -2.540e-01  6.761e-02  -3.756 0.000202 ***
## as.factor(month)10 -8.392e-02  6.693e-02  -1.254 0.210702    
## as.factor(month)11 -1.962e-01  6.803e-02  -2.884 0.004173 ** 
## as.factor(month)12  3.032e-02  5.426e-02   0.559 0.576682    
## price              -2.252e-02  1.646e-03 -13.687  < 2e-16 ***
## trend               2.804e-03  2.765e-04  10.142  < 2e-16 ***
## favored_count       6.810e-05  2.051e-05   3.321 0.000991 ***
## category_sold       4.094e-04  4.027e-05  10.165  < 2e-16 ***
## lag_1               2.718e-01  4.114e-02   6.606 1.47e-10 ***
## lag_2              -6.826e-02  3.466e-02  -1.970 0.049673 *  
## category_favored    1.291e-05  2.011e-06   6.418 4.52e-10 ***
## outlier_small      -3.120e-02  5.080e-02  -0.614 0.539580    
## outlier_great       1.597e-01  5.157e-02   3.096 0.002123 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2056 on 349 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8698, Adjusted R-squared:  0.8624 
## F-statistic: 116.6 on 20 and 349 DF,  p-value: < 2.2e-16
lm_model2 <- lm(maproducts, formula = log_sold ~ as.factor(month) + 
    price + trend + favored_count + category_sold + lag_1 + lag_2 + 
    category_favored + outlier_great)
summary(lm_model2)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count + 
##     category_sold + lag_1 + lag_2 + category_favored + outlier_great, 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.55181 -0.10312  0.02275  0.12664  0.49794 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.609e+00  3.385e-01  19.524  < 2e-16 ***
## as.factor(month)2  -5.195e-02  5.522e-02  -0.941 0.347438    
## as.factor(month)3  -4.444e-01  6.338e-02  -7.012 1.22e-11 ***
## as.factor(month)4  -7.916e-02  5.982e-02  -1.323 0.186609    
## as.factor(month)5   3.797e-02  5.792e-02   0.656 0.512502    
## as.factor(month)6  -8.605e-02  7.781e-02  -1.106 0.269553    
## as.factor(month)7  -1.853e-01  7.922e-02  -2.339 0.019912 *  
## as.factor(month)8  -1.673e-02  6.985e-02  -0.239 0.810870    
## as.factor(month)9  -2.538e-01  6.755e-02  -3.758 0.000201 ***
## as.factor(month)10 -8.449e-02  6.686e-02  -1.264 0.207196    
## as.factor(month)11 -1.966e-01  6.796e-02  -2.893 0.004053 ** 
## as.factor(month)12  2.804e-02  5.408e-02   0.518 0.604496    
## price              -2.246e-02  1.641e-03 -13.688  < 2e-16 ***
## trend               2.797e-03  2.760e-04  10.134  < 2e-16 ***
## favored_count       6.891e-05  2.045e-05   3.370 0.000835 ***
## category_sold       4.091e-04  4.023e-05  10.168  < 2e-16 ***
## lag_1               2.739e-01  4.095e-02   6.690 8.86e-11 ***
## lag_2              -6.992e-02  3.452e-02  -2.025 0.043598 *  
## category_favored    1.283e-05  2.006e-06   6.398 5.05e-10 ***
## outlier_great       1.613e-01  5.145e-02   3.136 0.001859 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2055 on 350 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8697, Adjusted R-squared:  0.8626 
## F-statistic: 122.9 on 19 and 350 DF,  p-value: < 2.2e-16

In this model,we can add week variable to model to examine its effectiveness in the model.

# weekly

maproducts[, `:=`(weeks, week(event_date))]

head(maproducts)
##    event_date product_content_id    price sold_count visit_count basket_count
## 1: 2021-05-31            6676673 120.6812        345       19578         1261
## 2: 2021-05-30            6676673 122.3905        342       16238         1214
## 3: 2021-05-29            6676673 123.7674        286       15222         1063
## 4: 2021-05-28            6676673 126.5967        258       13615          966
## 5: 2021-05-27            6676673 126.6857        268       12907          937
## 6: 2021-05-26            6676673 129.5155        233       13614          942
##    favored_count category_sold category_visits category_basket category_favored
## 1:          1510          4944          306462           23418            26597
## 2:           947          4442          298876           21479            23809
## 3:           880          4437          286091           21416            20727
## 4:           657           647            4604          270790            19955
## 5:           638           703            4637          273490            20959
## 6:           657           746            4842          276308            21197
##    category_brand_sold ty_visits month day trend log_sold    lag_1    lag_2
## 1:                 786 125439876     5   2     1 5.843544       NA       NA
## 2:                 696 131821083     5   1     2 5.834811 5.843544       NA
## 3:                 653 129670029     5   7     3 5.655992 5.834811 5.843544
## 4:               19556 103514886     5   6     4 5.552960 5.655992 5.834811
## 5:               20972 107391579     5   5     5 5.590987 5.552960 5.655992
## 6:               18830 106195988     5   4     6 5.451038 5.590987 5.552960
##       residuals     quant5   quant95 outlier_small outlier_great weeks
## 1:           NA         NA        NA            NA            NA    22
## 2: -1.578550704 -0.2772592 0.2830867             1             0    22
## 3:  0.003586055 -0.2772592 0.2830867             0             0    22
## 4:  0.022684073 -0.2772592 0.2830867             0             0    22
## 5: -0.100691835 -0.2772592 0.2830867             0             0    21
## 6:  0.129492494 -0.2772592 0.2830867             0             0    21
lm_model3_2 <- lm(maproducts, formula = log_sold ~ as.factor(month) + 
    price + trend + favored_count + category_sold + lag_1 + lag_2 + 
    category_favored + outlier_great + weeks)
summary(lm_model3_2)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count + 
##     category_sold + lag_1 + lag_2 + category_favored + outlier_great + 
##     weeks, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.51234 -0.10238  0.01298  0.12121  0.50255 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.779e+00  3.494e-01  19.401  < 2e-16 ***
## as.factor(month)2   1.490e-02  6.564e-02   0.227 0.820555    
## as.factor(month)3  -3.178e-01  9.260e-02  -3.432 0.000670 ***
## as.factor(month)4   1.273e-01  1.256e-01   1.014 0.311275    
## as.factor(month)5   3.190e-01  1.611e-01   1.980 0.048466 *  
## as.factor(month)6   2.550e-01  1.983e-01   1.286 0.199332    
## as.factor(month)7   2.293e-01  2.355e-01   0.974 0.330940    
## as.factor(month)8   4.673e-01  2.682e-01   1.742 0.082363 .  
## as.factor(month)9   2.959e-01  3.018e-01   0.980 0.327556    
## as.factor(month)10  5.355e-01  3.384e-01   1.582 0.114484    
## as.factor(month)11  4.975e-01  3.776e-01   1.317 0.188540    
## as.factor(month)12  7.966e-01  4.148e-01   1.920 0.055638 .  
## price              -2.275e-02  1.642e-03 -13.853  < 2e-16 ***
## trend               2.832e-03  2.757e-04  10.273  < 2e-16 ***
## favored_count       6.908e-05  2.037e-05   3.391 0.000777 ***
## category_sold       4.058e-04  4.013e-05  10.112  < 2e-16 ***
## lag_1               2.722e-01  4.081e-02   6.670 1.00e-10 ***
## lag_2              -8.312e-02  3.512e-02  -2.367 0.018482 *  
## category_favored    1.293e-05  2.000e-06   6.468 3.36e-10 ***
## outlier_great       1.577e-01  5.131e-02   3.074 0.002276 ** 
## weeks              -1.612e-02  8.625e-03  -1.869 0.062528 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2047 on 349 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.871,  Adjusted R-squared:  0.8636 
## F-statistic: 117.8 on 20 and 349 DF,  p-value: < 2.2e-16
# pick model 3

According to r-squared value,week variable makes contribution to our model. For another perspective AIC values of models.

require(ursa)
AIC(lm_model)
## [1] -91.34382
AIC(lm_model2)
## [1] -99.59528
AIC(lm_model3_2)
## [1] -101.2783

That proves claim of model with month as factor variable.

Now,It is time for us to come up with arima models.Firstly,we build time series with frequency=7.Plot that.

# time series and arima
ts_sold <- ts(maproducts$log_sold, frequency = 7)
ts.plot(ts_sold)

Box.test(ts_sold, lag = 7, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_sold
## X-squared = 700.49, df = 7, p-value < 2.2e-16

Variance is stable enough thanks to taking logarithm of data.Then, decompose time series data with additive type. Plot decomposed data

sold_decomp <- decompose(ts_sold, type = "additive")
plot(sold_decomp)

Random part is kind of stationary except some outlier points. We shall observe autocorrelation and partial autocorrelation of random part.

random = sold_decomp$random
# random part looks really fine.
acf(sold_decomp$random, na.action = na.pass)

pacf(sold_decomp$random, na.action = na.pass)

As we observe from those graphs,we start building arima models with (p,d,q)=(2,0,0) and increase p value to 10 one by one.After this process,we find model with auto arima function. Pick the best model.

ar_model <- arima(random, order = c(2, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(2, 0, 0))
## 
## Coefficients:
##          ar1      ar2  intercept
##       0.3298  -0.2968    -0.0009
## s.e.  0.0499   0.0498     0.0133
## 
## sigma^2 estimated as 0.0601:  log likelihood = -4.89,  aic = 17.79
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -8.385561e-05 0.2451437 0.1740224 54.28193 145.6099 0.7532093
##                     ACF1
## Training set -0.09301317
ar_model <- arima(random, order = c(3, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(3, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3  intercept
##       0.2346  -0.1924  -0.3165    -0.0005
## s.e.  0.0495   0.0499   0.0495     0.0096
## 
## sigma^2 estimated as 0.05402:  log likelihood = 14.45,  aic = -18.9
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.0004537202 0.2324248 0.1687795 62.61691 194.2566 0.7305164
##                    ACF1
## Training set -0.0294018
ar_model <- arima(random, order = c(4, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(4, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3     ar4  intercept
##       0.2033  -0.2116  -0.2938  -0.098    -0.0004
## s.e.  0.0520   0.0508   0.0507   0.052     0.0087
## 
## sigma^2 estimated as 0.0535:  log likelihood = 16.21,  aic = -20.43
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.0005966523 0.2312937 0.1683734 83.18963 180.3084 0.7287591
##                     ACF1
## Training set -0.02297017
ar_model <- arima(random, order = c(5, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(5, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5  intercept
##       0.1784  -0.2863  -0.3477  -0.0472  -0.2518    -0.0002
## s.e.  0.0506   0.0513   0.0502   0.0513   0.0505     0.0067
## 
## sigma^2 estimated as 0.05005:  log likelihood = 28.24,  aic = -42.48
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.0008350554 0.2237166 0.1612522 64.79473 189.4283 0.6979366
##                     ACF1
## Training set -0.04320653
ar_model <- arima(random, order = c(6, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(6, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5      ar6  intercept
##       0.1335  -0.2945  -0.4094  -0.0979  -0.2206  -0.1781    -0.0002
## s.e.  0.0514   0.0505   0.0526   0.0526   0.0504   0.0515     0.0056
## 
## sigma^2 estimated as 0.04844:  log likelihood = 34.13,  aic = -52.25
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE     MASE
## Training set -0.0007877407 0.2200882 0.1584303 47.56604 183.0987 0.685723
##                      ACF1
## Training set -0.006392688
ar_model <- arima(random, order = c(7, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(7, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5      ar6      ar7  intercept
##       0.1259  -0.3039  -0.4135  -0.1153  -0.2332  -0.1728  -0.0428    -0.0002
## s.e.  0.0522   0.0518   0.0528   0.0567   0.0527   0.0518   0.0523     0.0054
## 
## sigma^2 estimated as 0.04835:  log likelihood = 34.46,  aic = -50.92
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.0007593015 0.2198832 0.1577712 50.28049 177.2777 0.6828702
##                      ACF1
## Training set -0.003008649
ar_model9 <- arima(random, order = c(8, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(7, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5      ar6      ar7  intercept
##       0.1259  -0.3039  -0.4135  -0.1153  -0.2332  -0.1728  -0.0428    -0.0002
## s.e.  0.0522   0.0518   0.0528   0.0567   0.0527   0.0518   0.0523     0.0054
## 
## sigma^2 estimated as 0.04835:  log likelihood = 34.46,  aic = -50.92
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.0007593015 0.2198832 0.1577712 50.28049 177.2777 0.6828702
##                      ACF1
## Training set -0.003008649

Best model we have above is the model with (p,d,q)=(8,0,0).Then,we use auto arima function to compare our model with the model auto arima functions finds.

auto.arima(random, seasonal = FALSE, trace = TRUE)
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,0,2)           with non-zero mean : Inf
##  ARIMA(0,0,0)           with non-zero mean : 72.16319
##  ARIMA(1,0,0)           with non-zero mean : 50.53033
##  ARIMA(0,0,1)           with non-zero mean : 39.66487
##  ARIMA(0,0,0)           with zero mean     : 70.14585
##  ARIMA(1,0,1)           with non-zero mean : 42.61354
##  ARIMA(0,0,2)           with non-zero mean : 41.70325
##  ARIMA(1,0,2)           with non-zero mean : Inf
##  ARIMA(0,0,1)           with zero mean     : 37.63635
##  ARIMA(1,0,1)           with zero mean     : 40.57179
##  ARIMA(0,0,2)           with zero mean     : 39.6635
##  ARIMA(1,0,0)           with zero mean     : 48.4987
##  ARIMA(1,0,2)           with zero mean     : Inf
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(0,0,1)           with zero mean     : 37.73032
## 
##  Best model: ARIMA(0,0,1)           with zero mean
## Series: random 
## ARIMA(0,0,1) with zero mean 
## 
## Coefficients:
##          ma1
##       0.3147
## s.e.  0.0456
## 
## sigma^2 estimated as 0.06435:  log likelihood=-16.85
## AIC=37.7   AICc=37.73   BIC=45.5
fitted_model <- arima(random, order = c(8, 0, 0))
AIC(fitted_model)
## [1] -52.42999

Our model is better off.

To compare lm and arima models, we should compare them for test period which is from 2021-05-11 to 2021-05-29.

train_start = as.Date("2020-05-25")
test_start = as.Date("2021-05-11")
test_end = as.Date("2021-05-29")

test_dates = seq(test_start, test_end, by = "day")
test_dates
##  [1] "2021-05-11" "2021-05-12" "2021-05-13" "2021-05-14" "2021-05-15"
##  [6] "2021-05-16" "2021-05-17" "2021-05-18" "2021-05-19" "2021-05-20"
## [11] "2021-05-21" "2021-05-22" "2021-05-23" "2021-05-24" "2021-05-25"
## [16] "2021-05-26" "2021-05-27" "2021-05-28" "2021-05-29"
# forecast with lm model
forecast_with_lr = function(fmla, data, forecast_data) {
    fitted_lm = lm(as.formula(fmla), data)
    forecasted = predict(fitted_lm, forecast_data)
    return(list(forecast = as.numeric(forecasted), model = fitted_lm))
}

# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "log_sold", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arima(input_series, order = c(8, 0, 0))

    forecasted = forecast(fitted, h = forecast_ahead)
    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}

We define our functions with respect to parameters we found above.Then,we forecast for test dates with our lm and arima models separately.

# loop over the test dates
forecast_ahead = 1


results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
    current_date = test_dates[i] - forecast_ahead
    print(test_dates[i])
    past_data = maproducts[event_date <= current_date]
    forecast_data = maproducts[event_date == test_dates[i]]

    # first lm models
    fmla = "log_sold~as.factor(month)+price+trend+favored_count+category_sold+lag_1+lag_2+category_favored+outlier_great+weeks"
    forecasted = forecast_with_lr(fmla, past_data, forecast_data)
    forecast_data[, `:=`(lm_prediction, forecasted$forecast)]

    # arima model with auto.arima
    arima_forecast = forecast_with_arima(past_data, forecast_ahead, 
        "log_sold", is_trace = F)
    forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]

    results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$log_sold"
## [1] "2021-05-12"
## [1] "input_series=data$log_sold"
## [1] "2021-05-13"
## [1] "input_series=data$log_sold"
## [1] "2021-05-14"
## [1] "input_series=data$log_sold"
## [1] "2021-05-15"
## [1] "input_series=data$log_sold"
## [1] "2021-05-16"
## [1] "input_series=data$log_sold"
## [1] "2021-05-17"
## [1] "input_series=data$log_sold"
## [1] "2021-05-18"
## [1] "input_series=data$log_sold"
## [1] "2021-05-19"
## [1] "input_series=data$log_sold"
## [1] "2021-05-20"
## [1] "input_series=data$log_sold"
## [1] "2021-05-21"
## [1] "input_series=data$log_sold"
## [1] "2021-05-22"
## [1] "input_series=data$log_sold"
## [1] "2021-05-23"
## [1] "input_series=data$log_sold"
## [1] "2021-05-24"
## [1] "input_series=data$log_sold"
## [1] "2021-05-25"
## [1] "input_series=data$log_sold"
## [1] "2021-05-26"
## [1] "input_series=data$log_sold"
## [1] "2021-05-27"
## [1] "input_series=data$log_sold"
## [1] "2021-05-28"
## [1] "input_series=data$log_sold"
## [1] "2021-05-29"
## [1] "input_series=data$log_sold"
overall_results = rbindlist(results)

melted_result = melt(overall_results, c("event_date", "log_sold"), 
    c("lm_prediction", "arima_prediction"))

We turned the results into melted results(long form).

accu = function(actual, forecast) {
    n = length(actual)
    error = actual - forecast
    mean = mean(actual)
    sd = sd(actual)
    CV = sd/mean
    FBias = sum(error)/sum(actual)
    MAPE = sum(abs(error/actual))/n
    RMSE = sqrt(sum(error^2)/n)
    MAD = sum(abs(error))/n
    MADP = sum(abs(error))/sum(abs(actual))
    WMAPE = MAD/mean
    l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP, 
        WMAPE)
    return(l)
}

To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.

performance = melted_result[, accu(log_sold, value), by = list(variable)]

performance
##            variable  n     mean        sd         CV       FBias       MAPE
## 1:    lm_prediction 19 5.858585 0.3934767 0.06716242 -0.03175431 0.06462302
## 2: arima_prediction 19 5.858585 0.3934767 0.06716242 -0.05680289 0.07000923
##         RMSE       MAD       MADP      WMAPE
## 1: 0.7764512 0.3621781 0.06182007 0.06182007
## 2: 0.5073968 0.3865690 0.06598334 0.06598334

First view of data shows that lm prediction is better in terms of MAPE.However,let us see boxplot MAPE and FBias with respect to days of a week.

performance = melted_result[, accu(log_sold, value), by = list(event_date, 
    variable)]
performance[, `:=`(day_of_week, wday(event_date, label = T))]


ggplot(performance, aes(x = day_of_week, y = MAPE, fill = variable)) + 
    geom_boxplot()

ggplot(performance, aes(x = day_of_week, y = FBias, fill = variable)) + 
    geom_boxplot()

As we observe from table,MAPE values of lm prediction is less.They both tend to predict less.However,lm is better than arima in terms of FBias ,too.

So,we forecast with our lm model.Before that,we forecast our parameters by building arima model with auto arima function.

price_ts <- ts(maproducts$price, frequency = 7)
price_dec <- decompose(x = price_ts, type = "additive")
price_model = auto.arima(price_dec$random)
AIC(price_model)
## [1] 1966.841
price_model_forecast <- predict(price_model, n.ahead = 15)$pred
seasonality = price_dec$seasonal[1:1]
last_trend_value <- tail(price_dec$trend[!is.na(price_dec$trend)], 
    1)
price_model_forecast = price_model_forecast + last_trend_value + 
    seasonality
price_model_forecast
## Time Series:
## Start = c(54, 2) 
## End = c(56, 2) 
## Frequency = 7 
##  [1] 136.5906 137.0957 137.3132 137.3432 137.0187 136.9371 136.9831 136.9288
##  [9] 137.0708 137.1368 137.1460 137.0447 137.0193 137.0337 137.0167
model_fav <- auto.arima(maproducts$favored_count)
fav_fcast <- predict(model_fav, n.ahead = 15)$pred
model_lag_1 <- auto.arima(maproducts$lag_1)
lag1_fcast <- predict(model_lag_1, n.ahead = 15)$pred
model_lag_2 <- auto.arima(maproducts$lag_2)
lag2_fcast <- predict(model_lag_2, n.ahead = 15)$pred
model_cat <- auto.arima(maproducts$category_sold)
cat_ffcast <- predict(model_cat, n.ahead = 15)$pred
cat_fav_ts <- ts(maproducts$category_favored, frequency = 7)
cat_fav_dec <- decompose(x = cat_fav_ts, type = "additive")
cat_fav_model = auto.arima(cat_fav_dec$random)
AIC(cat_fav_model)
## [1] 6957.51
cat_fav_model_forecast <- predict(cat_fav_model, n.ahead = 15)$pred
brand_ts <- ts(maproducts$category_brand_sold, frequency = 7)
brand_dec <- decompose(x = brand_ts, type = "additive")
brand_model = auto.arima(brand_dec$random)
AIC(brand_model)
## [1] 6921.524
brand_model_forecast <- predict(brand_model, n.ahead = 15)$pred
outlier_ts <- ts(maproducts$outlier_great, frequency = 7)
outlier_dec <- decompose(x = outlier_ts, type = "additive")
outlier_model = auto.arima(outlier_dec$random)
AIC(outlier_model)
## [1] -292.537
outlier_model_forecast <- predict(outlier_model, n.ahead = 15)$pred
# outlier forecast0.005171854 trend 373 cat fav 24381.21 week
# 21
forecast_log <- predict(lm_model3_2, data.frame(month = as.factor(6), 
    price = price_model_forecast[1], trend = 373, favored_count = fav_fcast[1], 
    category_sold = cat_ffcast[1], category_brand_sold = brand_model_forecast[1], 
    lag_1 = lag1_fcast[1], lag_2 = lag2_fcast[1], category_favored = cat_fav_model_forecast[1], 
    outlier_great = outlier_model_forecast[1], weeks = 21))

Now,we forecast our value with our forecasted parameters.

fcast2 <- exp(forecast_log)


fcast2
##        1 
## 475.5609

##Fakir Dik Süpürge

After filtering our whole data for Fakir Dik Süpürge,we plot graph for amount that has been sold.

maproducts <- data[data$product_content_id == 7061886]
head(maproducts, 5)
##    event_date product_content_id    price sold_count visit_count basket_count
## 1: 2021-05-31            7061886 294.4167         24        1485           81
## 2: 2021-05-30            7061886 299.0000         11        1362           62
## 3: 2021-05-29            7061886 297.5714         14        1340           65
## 4: 2021-05-28            7061886 297.8889          9         972           62
## 5: 2021-05-27            7061886 299.0000          8         965           50
##    favored_count category_sold category_visits category_basket category_favored
## 1:           122           951           95645            4704             8886
## 2:           100           810           92899            3988             8050
## 3:           123           790           91368            4386             7491
## 4:            76            84             705           70347             2987
## 5:            71           101             767           72503             3399
##    category_brand_sold ty_visits
## 1:                 179 125439876
## 2:                 119 131821083
## 3:                 112 129670029
## 4:                6453 103514886
## 5:                6539 107391579
require(ggplot2)
ggplot(maproducts, aes(x = event_date, y = sold_count)) + geom_line()

That plot has wide variance, which urges us to take logarithm of sale data.Moreover, we add day,month ,trend data for further process.Then,we plot log of sold products over time to see if we decrease variance or not.

maproducts[, `:=`(month, month(event_date))]
maproducts[, `:=`(day, lubridate::wday(event_date))]

head(maproducts, 10)
##     event_date product_content_id    price sold_count visit_count basket_count
##  1: 2021-05-31            7061886 294.4167         24        1485           81
##  2: 2021-05-30            7061886 299.0000         11        1362           62
##  3: 2021-05-29            7061886 297.5714         14        1340           65
##  4: 2021-05-28            7061886 297.8889          9         972           62
##  5: 2021-05-27            7061886 299.0000          8         965           50
##  6: 2021-05-26            7061886 299.0000         12        1098           63
##  7: 2021-05-25            7061886 295.0038         16        1138           65
##  8: 2021-05-24            7061886 299.0000          7        1137           53
##  9: 2021-05-23            7061886 298.1765         17        1540           82
## 10: 2021-05-22            7061886 297.7273         11        1515           58
##     favored_count category_sold category_visits category_basket
##  1:           122           951           95645            4704
##  2:           100           810           92899            3988
##  3:           123           790           91368            4386
##  4:            76            84             705           70347
##  5:            71           101             767           72503
##  6:            71           104             785           71162
##  7:            93           112             803           71048
##  8:            98           106             795           78411
##  9:           132           143             835          100040
## 10:           120           122             887          103373
##     category_favored category_brand_sold ty_visits month day
##  1:             8886                 179 125439876     5   2
##  2:             8050                 119 131821083     5   1
##  3:             7491                 112 129670029     5   7
##  4:             2987                6453 103514886     5   6
##  5:             3399                6539 107391579     5   5
##  6:             3540                5162 106195988     5   4
##  7:             3258                5456 107004119     5   3
##  8:             3408                6025 108235639     5   2
##  9:             4032                8904 134993625     5   1
## 10:             4124                8167 133292217     5   7
maproducts[, `:=`(trend, 1:.N)]
maproducts[, `:=`(log_sold, log(sold_count))]
ggplot(maproducts, aes(x = event_date, y = log_sold)) + geom_line()

Variance of log of sold products seems more stable.Therefore,we start building linear regression models with logarithmic data.

lm_model <- lm(maproducts, formula = log_sold ~ month + day + 
    price + visit_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + day + price + visit_count, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.21797 -0.28910  0.01365  0.25433  1.88324 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.282e+00  4.201e-01  14.953  < 2e-16 ***
## month        8.944e-02  9.893e-03   9.040  < 2e-16 ***
## day         -2.357e-02  1.413e-02  -1.668   0.0963 .  
## price       -1.351e-02  1.549e-03  -8.721  < 2e-16 ***
## visit_count  2.608e-04  3.363e-05   7.755 8.78e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5444 on 367 degrees of freedom
## Multiple R-squared:  0.3744, Adjusted R-squared:  0.3676 
## F-statistic: 54.91 on 4 and 367 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.28849 -0.29884  0.00666  0.26591  1.93000 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.196e+00  4.180e-01  14.824  < 2e-16 ***
## month        8.979e-02  9.914e-03   9.057  < 2e-16 ***
## price       -1.356e-02  1.552e-03  -8.733  < 2e-16 ***
## visit_count  2.637e-04  3.367e-05   7.831 5.21e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5458 on 368 degrees of freedom
## Multiple R-squared:  0.3696, Adjusted R-squared:  0.3645 
## F-statistic: 71.93 on 3 and 368 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend, 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.16167 -0.29608  0.03255  0.29471  1.77022 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.236e+01  1.022e+00  12.087  < 2e-16 ***
## month        7.415e-02  9.696e-03   7.648 1.81e-13 ***
## price       -3.259e-02  3.262e-03  -9.990  < 2e-16 ***
## visit_count  1.197e-04  3.877e-05   3.087  0.00218 ** 
## trend       -5.312e-03  8.126e-04  -6.537 2.10e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5172 on 367 degrees of freedom
## Multiple R-squared:  0.4354, Adjusted R-squared:  0.4292 
## F-statistic: 70.75 on 4 and 367 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.12389 -0.27978  0.01514  0.27842  1.83699 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.383e+01  1.057e+00  13.078  < 2e-16 ***
## month          6.497e-02  9.719e-03   6.684 8.68e-11 ***
## price         -3.789e-02  3.423e-03 -11.070  < 2e-16 ***
## visit_count    3.179e-05  4.315e-05   0.737    0.462    
## trend         -5.849e-03  8.042e-04  -7.273 2.15e-12 ***
## favored_count  1.375e-03  3.230e-04   4.258 2.62e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5056 on 366 degrees of freedom
## Multiple R-squared:  0.462,  Adjusted R-squared:  0.4547 
## F-statistic: 62.87 on 5 and 366 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count + category_sold)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count + category_sold, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.96936 -0.18329  0.05771  0.28790  0.78442 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.073e+01  9.076e-01  11.828  < 2e-16 ***
## month          4.209e-02  8.243e-03   5.106 5.31e-07 ***
## price         -2.854e-02  2.926e-03  -9.753  < 2e-16 ***
## visit_count   -3.011e-05  3.607e-05  -0.835    0.404    
## trend         -3.466e-03  6.912e-04  -5.015 8.31e-07 ***
## favored_count  1.528e-03  2.678e-04   5.706 2.39e-08 ***
## category_sold  2.145e-03  1.654e-04  12.971  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4188 on 365 degrees of freedom
## Multiple R-squared:  0.6318, Adjusted R-squared:  0.6257 
## F-statistic: 104.4 on 6 and 365 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + favored_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + favored_count, 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.27022 -0.28029  0.01254  0.24740  1.99072 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.815e+00  4.636e-01  14.700  < 2e-16 ***
## month          8.422e-02  9.991e-03   8.430    8e-16 ***
## price         -1.603e-02  1.750e-03  -9.161  < 2e-16 ***
## visit_count    2.100e-04  3.795e-05   5.533    6e-08 ***
## favored_count  1.007e-03  3.408e-04   2.955  0.00333 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5401 on 367 degrees of freedom
## Multiple R-squared:  0.3843, Adjusted R-squared:  0.3776 
## F-statistic: 57.27 on 4 and 367 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    trend + favored_count)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + trend + favored_count, 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.11449 -0.28832  0.02006  0.27857  1.83358 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   14.2076655  0.9202488  15.439  < 2e-16 ***
## month          0.0618618  0.0087523   7.068 7.97e-12 ***
## price         -0.0390015  0.0030733 -12.690  < 2e-16 ***
## trend         -0.0061856  0.0006616  -9.349  < 2e-16 ***
## favored_count  0.0014892  0.0002835   5.253 2.54e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5052 on 367 degrees of freedom
## Multiple R-squared:  0.4612, Adjusted R-squared:  0.4554 
## F-statistic: 78.55 on 4 and 367 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count + category_sold + category_brand_sold)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count + category_sold + category_brand_sold, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.96934 -0.20414  0.05615  0.29352  0.79521 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.087e+01  9.038e-01  12.031  < 2e-16 ***
## month                4.544e-02  8.313e-03   5.466 8.55e-08 ***
## price               -2.934e-02  2.928e-03 -10.022  < 2e-16 ***
## visit_count         -5.821e-05  3.776e-05  -1.542  0.12402    
## trend               -3.373e-03  6.880e-04  -4.903 1.42e-06 ***
## favored_count        1.082e-03  3.263e-04   3.314  0.00101 ** 
## category_sold        2.114e-03  1.648e-04  12.829  < 2e-16 ***
## category_brand_sold  2.482e-05  1.049e-05   2.367  0.01847 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4162 on 364 degrees of freedom
## Multiple R-squared:  0.6374, Adjusted R-squared:  0.6304 
## F-statistic: 91.39 on 7 and 364 DF,  p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    visit_count + trend + favored_count + category_sold + category_brand_sold)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend + 
##     favored_count + category_sold + category_brand_sold, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.96934 -0.20414  0.05615  0.29352  0.79521 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.087e+01  9.038e-01  12.031  < 2e-16 ***
## month                4.544e-02  8.313e-03   5.466 8.55e-08 ***
## price               -2.934e-02  2.928e-03 -10.022  < 2e-16 ***
## visit_count         -5.821e-05  3.776e-05  -1.542  0.12402    
## trend               -3.373e-03  6.880e-04  -4.903 1.42e-06 ***
## favored_count        1.082e-03  3.263e-04   3.314  0.00101 ** 
## category_sold        2.114e-03  1.648e-04  12.829  < 2e-16 ***
## category_brand_sold  2.482e-05  1.049e-05   2.367  0.01847 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4162 on 364 degrees of freedom
## Multiple R-squared:  0.6374, Adjusted R-squared:  0.6304 
## F-statistic: 91.39 on 7 and 364 DF,  p-value: < 2.2e-16
# that is the best
lm_model <- lm(maproducts, formula = log_sold ~ month + price + 
    trend + favored_count + category_sold + category_brand_sold)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ month + price + trend + favored_count + 
##     category_sold + category_brand_sold, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98687 -0.19292  0.05216  0.28444  0.82732 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.027e+01  8.165e-01  12.580  < 2e-16 ***
## month                5.013e-02  7.752e-03   6.466 3.23e-10 ***
## price               -2.752e-02  2.684e-03 -10.255  < 2e-16 ***
## trend               -2.882e-03  6.110e-04  -4.717 3.41e-06 ***
## favored_count        9.863e-04  3.210e-04   3.072  0.00228 ** 
## category_sold        2.089e-03  1.643e-04  12.714  < 2e-16 ***
## category_brand_sold  1.973e-05  9.973e-06   1.979  0.04859 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.417 on 365 degrees of freedom
## Multiple R-squared:  0.635,  Adjusted R-squared:  0.629 
## F-statistic: 105.8 on 6 and 365 DF,  p-value: < 2.2e-16

After evaluation of different models parameters of which is included in data, we choose model formula of which is ‘log_sold~month+price+trend+favored_count+category_sold+category_brand_sold’.

Furhermore,lag variables could violate our model.For that reason, we add lag1 and lag2 variables to model and examine their effectiveness.

maproducts[, `:=`(lag_1, NA)]
maproducts$lag_1[2:372] <- maproducts$log_sold[1:371]
maproducts[, `:=`(lag_2, NA)]
maproducts$lag_2[3:372] <- maproducts$log_sold[1:370]
lm_model <- lm(formula = log_sold ~ as.factor(month) + price + 
    trend + favored_count + category_sold + category_brand_sold, 
    data = maproducts)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count + 
##     category_sold + category_brand_sold, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.47670 -0.15665  0.00544  0.19455  0.81009 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.264e+01  8.687e-01  14.547  < 2e-16 ***
## as.factor(month)2   -1.242e-01  8.085e-02  -1.536 0.125316    
## as.factor(month)3    1.719e-01  8.479e-02   2.028 0.043350 *  
## as.factor(month)4   -1.171e-01  9.691e-02  -1.209 0.227585    
## as.factor(month)5   -4.716e-02  8.215e-02  -0.574 0.566226    
## as.factor(month)6   -4.277e-01  1.260e-01  -3.394 0.000766 ***
## as.factor(month)7   -7.801e-01  1.126e-01  -6.927 2.03e-11 ***
## as.factor(month)8   -8.433e-01  1.136e-01  -7.425 8.46e-13 ***
## as.factor(month)9   -1.872e-01  1.147e-01  -1.633 0.103442    
## as.factor(month)10   3.438e-01  1.045e-01   3.291 0.001099 ** 
## as.factor(month)11   4.577e-01  1.037e-01   4.414 1.35e-05 ***
## as.factor(month)12   1.787e-01  8.417e-02   2.123 0.034453 *  
## price               -3.586e-02  2.779e-03 -12.903  < 2e-16 ***
## trend               -1.628e-03  6.307e-04  -2.581 0.010266 *  
## favored_count        8.743e-04  2.725e-04   3.209 0.001454 ** 
## category_sold        1.818e-03  1.317e-04  13.811  < 2e-16 ***
## category_brand_sold  3.862e-05  7.700e-06   5.016 8.36e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3067 on 355 degrees of freedom
## Multiple R-squared:  0.808,  Adjusted R-squared:  0.7993 
## F-statistic: 93.37 on 16 and 355 DF,  p-value: < 2.2e-16
lm_model <- lm(formula = log_sold ~ as.factor(month) + price + 
    trend + favored_count + category_sold + category_brand_sold + 
    lag_1, data = maproducts)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count + 
##     category_sold + category_brand_sold + lag_1, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.42419 -0.14600  0.00883  0.17404  0.73865 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.763e+00  9.480e-01  10.299  < 2e-16 ***
## as.factor(month)2   -9.232e-02  7.690e-02  -1.201 0.230742    
## as.factor(month)3    1.329e-01  8.067e-02   1.648 0.100227    
## as.factor(month)4   -1.076e-01  9.195e-02  -1.170 0.242845    
## as.factor(month)5   -3.904e-02  7.803e-02  -0.500 0.617120    
## as.factor(month)6   -3.353e-01  1.204e-01  -2.785 0.005641 ** 
## as.factor(month)7   -5.888e-01  1.113e-01  -5.292 2.13e-07 ***
## as.factor(month)8   -6.748e-01  1.113e-01  -6.065 3.40e-09 ***
## as.factor(month)9   -1.652e-01  1.089e-01  -1.518 0.130008    
## as.factor(month)10   2.422e-01  1.004e-01   2.413 0.016352 *  
## as.factor(month)11   3.022e-01  1.013e-01   2.983 0.003048 ** 
## as.factor(month)12   1.344e-01  8.017e-02   1.677 0.094435 .  
## price               -2.775e-02  2.941e-03  -9.436  < 2e-16 ***
## trend               -1.332e-03  6.017e-04  -2.213 0.027555 *  
## favored_count        7.305e-04  2.617e-04   2.791 0.005537 ** 
## category_sold        1.614e-03  1.412e-04  11.427  < 2e-16 ***
## category_brand_sold  2.612e-05  7.666e-06   3.407 0.000733 ***
## lag_1                2.306e-01  3.921e-02   5.881 9.47e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2909 on 353 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.8281, Adjusted R-squared:  0.8198 
## F-statistic:   100 on 17 and 353 DF,  p-value: < 2.2e-16
lm_model <- lm(formula = log_sold ~ as.factor(month) + price + 
    trend + favored_count + category_sold + category_brand_sold + 
    lag_1 + lag_2, data = maproducts)
summary(lm_model)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count + 
##     category_sold + category_brand_sold + lag_1 + lag_2, data = maproducts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4022 -0.1343  0.0095  0.1755  0.7167 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.411e+00  9.285e-01  10.135  < 2e-16 ***
## as.factor(month)2   -9.267e-02  7.502e-02  -1.235   0.2175    
## as.factor(month)3    1.245e-01  7.881e-02   1.579   0.1152    
## as.factor(month)4   -1.070e-01  8.994e-02  -1.190   0.2349    
## as.factor(month)5   -1.745e-02  7.655e-02  -0.228   0.8198    
## as.factor(month)6   -3.005e-01  1.180e-01  -2.547   0.0113 *  
## as.factor(month)7   -5.614e-01  1.087e-01  -5.163 4.08e-07 ***
## as.factor(month)8   -6.576e-01  1.086e-01  -6.056 3.60e-09 ***
## as.factor(month)9   -1.625e-01  1.067e-01  -1.523   0.1288    
## as.factor(month)10   2.231e-01  9.927e-02   2.247   0.0252 *  
## as.factor(month)11   2.481e-01  1.012e-01   2.452   0.0147 *  
## as.factor(month)12   1.304e-01  7.830e-02   1.666   0.0967 .  
## price               -2.642e-02  2.884e-03  -9.162  < 2e-16 ***
## trend               -1.434e-03  5.887e-04  -2.436   0.0153 *  
## favored_count        8.169e-04  2.568e-04   3.181   0.0016 ** 
## category_sold        1.794e-03  1.436e-04  12.488  < 2e-16 ***
## category_brand_sold  1.638e-05  7.789e-06   2.102   0.0362 *  
## lag_1                2.120e-01  4.535e-02   4.673 4.23e-06 ***
## lag_2                2.640e-02  3.872e-02   0.682   0.4958    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2838 on 351 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8364, Adjusted R-squared:  0.8281 
## F-statistic: 99.72 on 18 and 351 DF,  p-value: < 2.2e-16

That modification gives us better model with respect to r-squared value.Our new best model has formula that is ‘log_sold ~ as.factor(month) + price + trend + favored_count + category_sold + category_brand_sold + lag_1 + lag_2’.

That is quite developing iteration,too.From the first view of logarithm of sold products,we may predict outliers that breaks our models’ goodness.Let us put them into model.

maproducts[, `:=`(residuals, NA)]
maproducts$residuals[2:372] <- lm_model$residuals
## Warning in maproducts$residuals[2:372] <- lm_model$residuals: number of items to
## replace is not a multiple of replacement length
maproducts[!is.na(residuals), `:=`(quant5, quantile(residuals, 
    0.05))]
maproducts[!is.na(residuals), `:=`(quant95, quantile(residuals, 
    0.95))]
maproducts[, `:=`(outlier_small, as.numeric(residuals < quant5))]
maproducts[, `:=`(outlier_great, as.numeric(residuals > quant95))]
lm_model2 <- lm(formula = log_sold ~ as.factor(month) + price + 
    trend + favored_count + category_sold + category_brand_sold + 
    lag_1 + lag_2 + outlier_small + outlier_great, data = maproducts)
summary(lm_model2)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count + 
##     category_sold + category_brand_sold + lag_1 + lag_2 + outlier_small + 
##     outlier_great, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.24210 -0.14328  0.01365  0.17803  0.70603 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.203e+00  9.238e-01   9.963  < 2e-16 ***
## as.factor(month)2   -7.018e-02  7.481e-02  -0.938  0.34885    
## as.factor(month)3    1.379e-01  7.846e-02   1.758  0.07968 .  
## as.factor(month)4   -8.774e-02  8.952e-02  -0.980  0.32768    
## as.factor(month)5    1.394e-03  7.634e-02   0.018  0.98544    
## as.factor(month)6   -2.991e-01  1.170e-01  -2.556  0.01100 *  
## as.factor(month)7   -5.438e-01  1.081e-01  -5.033 7.75e-07 ***
## as.factor(month)8   -6.237e-01  1.083e-01  -5.757 1.88e-08 ***
## as.factor(month)9   -1.371e-01  1.062e-01  -1.290  0.19774    
## as.factor(month)10   2.311e-01  9.863e-02   2.343  0.01969 *  
## as.factor(month)11   2.564e-01  1.006e-01   2.549  0.01123 *  
## as.factor(month)12   1.436e-01  7.795e-02   1.843  0.06622 .  
## price               -2.576e-02  2.870e-03  -8.976  < 2e-16 ***
## trend               -1.250e-03  5.873e-04  -2.128  0.03408 *  
## favored_count        7.841e-04  2.550e-04   3.075  0.00227 ** 
## category_sold        1.810e-03  1.425e-04  12.696  < 2e-16 ***
## category_brand_sold  1.805e-05  7.762e-06   2.325  0.02064 *  
## lag_1                2.095e-01  4.498e-02   4.658 4.54e-06 ***
## lag_2                2.644e-02  3.840e-02   0.689  0.49156    
## outlier_small       -1.946e-01  7.018e-02  -2.773  0.00584 ** 
## outlier_great       -5.484e-02  6.806e-02  -0.806  0.42094    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2813 on 349 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8401, Adjusted R-squared:  0.831 
## F-statistic:  91.7 on 20 and 349 DF,  p-value: < 2.2e-16
lm_model2 <- lm(formula = log_sold ~ as.factor(month) + price + 
    trend + favored_count + category_sold + category_brand_sold + 
    lag_1 + lag_2 + outlier_small, data = maproducts)
summary(lm_model2)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count + 
##     category_sold + category_brand_sold + lag_1 + lag_2 + outlier_small, 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.24046 -0.14048  0.01292  0.17476  0.71152 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.235e+00  9.225e-01  10.012  < 2e-16 ***
## as.factor(month)2   -7.136e-02  7.476e-02  -0.955  0.34043    
## as.factor(month)3    1.408e-01  7.833e-02   1.798  0.07308 .  
## as.factor(month)4   -8.630e-02  8.946e-02  -0.965  0.33536    
## as.factor(month)5    3.586e-03  7.625e-02   0.047  0.96252    
## as.factor(month)6   -2.968e-01  1.169e-01  -2.539  0.01156 *  
## as.factor(month)7   -5.479e-01  1.079e-01  -5.079 6.18e-07 ***
## as.factor(month)8   -6.266e-01  1.082e-01  -5.790 1.57e-08 ***
## as.factor(month)9   -1.382e-01  1.062e-01  -1.302  0.19378    
## as.factor(month)10   2.348e-01  9.847e-02   2.384  0.01764 *  
## as.factor(month)11   2.606e-01  1.004e-01   2.595  0.00985 ** 
## as.factor(month)12   1.465e-01  7.782e-02   1.883  0.06051 .  
## price               -2.589e-02  2.864e-03  -9.039  < 2e-16 ***
## trend               -1.264e-03  5.868e-04  -2.154  0.03193 *  
## favored_count        7.809e-04  2.548e-04   3.065  0.00235 ** 
## category_sold        1.808e-03  1.425e-04  12.693  < 2e-16 ***
## category_brand_sold  1.830e-05  7.752e-06   2.361  0.01876 *  
## lag_1                2.102e-01  4.495e-02   4.676 4.19e-06 ***
## lag_2                2.580e-02  3.837e-02   0.672  0.50175    
## outlier_small       -1.902e-01  6.993e-02  -2.720  0.00686 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2812 on 350 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8398, Adjusted R-squared:  0.8311 
## F-statistic: 96.58 on 19 and 350 DF,  p-value: < 2.2e-16

As we expected, outliers help our model to predict better. In this model,we can add week variable to model to examine its effectiveness in the model.

maproducts[, `:=`(weeks, week(event_date))]

head(maproducts)
##    event_date product_content_id    price sold_count visit_count basket_count
## 1: 2021-05-31            7061886 294.4167         24        1485           81
## 2: 2021-05-30            7061886 299.0000         11        1362           62
## 3: 2021-05-29            7061886 297.5714         14        1340           65
## 4: 2021-05-28            7061886 297.8889          9         972           62
## 5: 2021-05-27            7061886 299.0000          8         965           50
## 6: 2021-05-26            7061886 299.0000         12        1098           63
##    favored_count category_sold category_visits category_basket category_favored
## 1:           122           951           95645            4704             8886
## 2:           100           810           92899            3988             8050
## 3:           123           790           91368            4386             7491
## 4:            76            84             705           70347             2987
## 5:            71           101             767           72503             3399
## 6:            71           104             785           71162             3540
##    category_brand_sold ty_visits month day trend log_sold    lag_1    lag_2
## 1:                 179 125439876     5   2     1 3.178054       NA       NA
## 2:                 119 131821083     5   1     2 2.397895 3.178054       NA
## 3:                 112 129670029     5   7     3 2.639057 2.397895 3.178054
## 4:                6453 103514886     5   6     4 2.197225 2.639057 2.397895
## 5:                6539 107391579     5   5     5 2.079442 2.197225 2.639057
## 6:                5162 106195988     5   4     6 2.484907 2.079442 2.197225
##     residuals     quant5   quant95 outlier_small outlier_great weeks
## 1:         NA         NA        NA            NA            NA    22
## 2: -0.9995279 -0.4841688 0.3882731             1             0    22
## 3: -0.2609915 -0.4841688 0.3882731             0             0    22
## 4: -0.2885245 -0.4841688 0.3882731             0             0    22
## 5:  0.1721715 -0.4841688 0.3882731             0             0    21
## 6:  0.2357392 -0.4841688 0.3882731             0             0    21
lm_model3 <- lm(formula = log_sold ~ as.factor(month) + price + 
    trend + favored_count + category_sold + category_brand_sold + 
    lag_1 + lag_2 + outlier_small + weeks, data = maproducts)
summary(lm_model3)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count + 
##     category_sold + category_brand_sold + lag_1 + lag_2 + outlier_small + 
##     weeks, data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.24886 -0.14436  0.00757  0.17470  0.70989 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.266e+00  9.295e-01   9.969  < 2e-16 ***
## as.factor(month)2   -5.729e-02  8.865e-02  -0.646  0.51853    
## as.factor(month)3    1.697e-01  1.252e-01   1.355  0.17614    
## as.factor(month)4   -4.210e-02  1.740e-01  -0.242  0.80900    
## as.factor(month)5    6.359e-02  2.165e-01   0.294  0.76916    
## as.factor(month)6   -2.230e-01  2.755e-01  -0.809  0.41884    
## as.factor(month)7   -4.601e-01  3.157e-01  -1.457  0.14596    
## as.factor(month)8   -5.237e-01  3.640e-01  -1.439  0.15109    
## as.factor(month)9   -1.941e-02  4.150e-01  -0.047  0.96273    
## as.factor(month)10   3.693e-01  4.646e-01   0.795  0.42725    
## as.factor(month)11   4.102e-01  5.151e-01   0.796  0.42642    
## as.factor(month)12   3.126e-01  5.660e-01   0.552  0.58110    
## price               -2.594e-02  2.872e-03  -9.030  < 2e-16 ***
## trend               -1.275e-03  5.887e-04  -2.165  0.03105 *  
## favored_count        7.793e-04  2.552e-04   3.053  0.00244 ** 
## category_sold        1.809e-03  1.427e-04  12.680  < 2e-16 ***
## category_brand_sold  1.811e-05  7.790e-06   2.324  0.02069 *  
## lag_1                2.091e-01  4.514e-02   4.633 5.09e-06 ***
## lag_2                2.515e-02  3.849e-02   0.653  0.51391    
## outlier_small       -1.896e-01  7.005e-02  -2.707  0.00713 ** 
## weeks               -3.450e-03  1.165e-02  -0.296  0.76728    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2816 on 349 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8399, Adjusted R-squared:  0.8307 
## F-statistic: 91.52 on 20 and 349 DF,  p-value: < 2.2e-16
# take that of
lm_model2_3 <- lm(formula = log_sold ~ as.factor(month) + price + 
    trend + favored_count + category_sold + category_brand_sold + 
    lag_1 + lag_2 + outlier_small, data = maproducts)
summary(lm_model2_3)
## 
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count + 
##     category_sold + category_brand_sold + lag_1 + lag_2 + outlier_small, 
##     data = maproducts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.24046 -0.14048  0.01292  0.17476  0.71152 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.235e+00  9.225e-01  10.012  < 2e-16 ***
## as.factor(month)2   -7.136e-02  7.476e-02  -0.955  0.34043    
## as.factor(month)3    1.408e-01  7.833e-02   1.798  0.07308 .  
## as.factor(month)4   -8.630e-02  8.946e-02  -0.965  0.33536    
## as.factor(month)5    3.586e-03  7.625e-02   0.047  0.96252    
## as.factor(month)6   -2.968e-01  1.169e-01  -2.539  0.01156 *  
## as.factor(month)7   -5.479e-01  1.079e-01  -5.079 6.18e-07 ***
## as.factor(month)8   -6.266e-01  1.082e-01  -5.790 1.57e-08 ***
## as.factor(month)9   -1.382e-01  1.062e-01  -1.302  0.19378    
## as.factor(month)10   2.348e-01  9.847e-02   2.384  0.01764 *  
## as.factor(month)11   2.606e-01  1.004e-01   2.595  0.00985 ** 
## as.factor(month)12   1.465e-01  7.782e-02   1.883  0.06051 .  
## price               -2.589e-02  2.864e-03  -9.039  < 2e-16 ***
## trend               -1.264e-03  5.868e-04  -2.154  0.03193 *  
## favored_count        7.809e-04  2.548e-04   3.065  0.00235 ** 
## category_sold        1.808e-03  1.425e-04  12.693  < 2e-16 ***
## category_brand_sold  1.830e-05  7.752e-06   2.361  0.01876 *  
## lag_1                2.102e-01  4.495e-02   4.676 4.19e-06 ***
## lag_2                2.580e-02  3.837e-02   0.672  0.50175    
## outlier_small       -1.902e-01  6.993e-02  -2.720  0.00686 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2812 on 350 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8398, Adjusted R-squared:  0.8311 
## F-statistic: 96.58 on 19 and 350 DF,  p-value: < 2.2e-16

For another perspective AIC values of models.

require(ursa)
AIC(lm_model)
## [1] 138.3645
AIC(lm_model2_3)
## [1] 132.6254
AIC(lm_model3)
## [1] 134.5325

That proves claim of model with month as factor variable.

Now,It is time for us to come up with arima models.Firstly,we build time series with frequency=7.Plot that.

# time series and arima
ts_sold <- ts(maproducts$log_sold, frequency = 7)
ts.plot(ts_sold)

Box.test(ts_sold, lag = 7, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_sold
## X-squared = 856.91, df = 7, p-value < 2.2e-16

Variance is stable enough thanks to taking logarithm of data.Then, decompose time series data with additive type. Plot decomposed data

sold_decomp <- decompose(ts_sold, type = "additive")
plot(sold_decomp)

Random part is kind of stationary except some outlier points. We shall observe autocorrelation and partial autocorrelation of random part.

random = sold_decomp$random
# random part looks really fine.
acf(sold_decomp$random, na.action = na.pass)

pacf(sold_decomp$random, na.action = na.pass)

As we observe from those graphs,we start building arima models with (p,d,q)=(2,0,0) and increase p value to 10 one by one.After this process,we find model with auto arima function. Pick the best model.

ar_model <- arima(random, order = c(2, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(2, 0, 0))
## 
## Coefficients:
##          ar1      ar2  intercept
##       0.1652  -0.1504    -0.0003
## s.e.  0.0517   0.0517     0.0153
## 
## sigma^2 estimated as 0.08328:  log likelihood = -64.51,  aic = 137.02
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -9.192964e-05 0.2885835 0.2252879 140.0453 164.2215 0.7462189
##                     ACF1
## Training set -0.04263476
ar_model <- arima(random, order = c(3, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(3, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3  intercept
##       0.1223  -0.1031  -0.2901    -0.0003
## s.e.  0.0500   0.0501   0.0503     0.0114
## 
## sigma^2 estimated as 0.07628:  log likelihood = -48.58,  aic = 107.17
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE     MASE
## Training set -0.0004172213 0.2761956 0.2117842 73.38254 196.0252 0.701491
##                     ACF1
## Training set -0.03808026
ar_model <- arima(random, order = c(4, 0, 0))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(4, 0, 0))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4  intercept
##       0.0837  -0.1163  -0.2748  -0.1341     -4e-04
## s.e.  0.0517   0.0499   0.0502   0.0521      1e-02
## 
## sigma^2 estimated as 0.07491:  log likelihood = -45.3,  aic = 102.6
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.0004704353 0.2736996 0.2094488 67.30758 183.3832 0.6937554
##                     ACF1
## Training set -0.03024761
ar_model <- arima(random, order = c(0, 0, 1))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(0, 0, 1))
## 
## Coefficients:
##          ma1  intercept
##       0.1664    -0.0004
## s.e.  0.0510     0.0178
## 
## sigma^2 estimated as 0.08482:  log likelihood = -67.85,  aic = 141.7
## 
## Training set error measures:
##                        ME      RMSE       MAE      MPE    MAPE      MASE
## Training set 4.311051e-05 0.2912457 0.2276016 106.9205 132.293 0.7538828
##                      ACF1
## Training set -0.005386586
ar_model <- arima(random, order = c(0, 0, 2))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(0, 0, 2))
## 
## Coefficients:
##          ma1      ma2  intercept
##       0.1029  -0.0909    -0.0003
## s.e.  0.0871   0.0971     0.0154
## 
## sigma^2 estimated as 0.08458:  log likelihood = -67.33,  aic = 142.66
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -4.257592e-05 0.2908307 0.2280883 129.2621 144.3999 0.7554947
##                    ACF1
## Training set 0.02991958
# THAT IS THE ONE
ar_model9 <- arima(random, order = c(0, 0, 3))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(0, 0, 2))
## 
## Coefficients:
##          ma1      ma2  intercept
##       0.1029  -0.0909    -0.0003
## s.e.  0.0871   0.0971     0.0154
## 
## sigma^2 estimated as 0.08458:  log likelihood = -67.33,  aic = 142.66
## 
## Training set error measures:
##                         ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -4.257592e-05 0.2908307 0.2280883 129.2621 144.3999 0.7554947
##                    ACF1
## Training set 0.02991958
ar_model <- arima(random, order = c(0, 0, 4))
summary(ar_model)
## 
## Call:
## arima(x = random, order = c(0, 0, 4))
## 
## Coefficients:
##           ma1      ma2      ma3      ma4  intercept
##       -0.1232  -0.3761  -0.4221  -0.0786      1e-04
## s.e.   0.0513   0.0508   0.0451   0.0553      3e-04
## 
## sigma^2 estimated as 0.06204:  log likelihood = -13.09,  aic = 38.18
## 
## Training set error measures:
##                        ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.000978949 0.2490877 0.1922267 84.52326 221.7878 0.6367108
##                     ACF1
## Training set 0.004184797

Best model we have above is the model with (p,d,q)=(0,0,3).

To compare lm and arima models, we should compare them for test period which is from 2021-05-11 to 2021-05-29.

train_start = as.Date("2020-05-25")
test_start = as.Date("2021-05-11")
test_end = as.Date("2021-05-29")

test_dates = seq(test_start, test_end, by = "day")
test_dates
##  [1] "2021-05-11" "2021-05-12" "2021-05-13" "2021-05-14" "2021-05-15"
##  [6] "2021-05-16" "2021-05-17" "2021-05-18" "2021-05-19" "2021-05-20"
## [11] "2021-05-21" "2021-05-22" "2021-05-23" "2021-05-24" "2021-05-25"
## [16] "2021-05-26" "2021-05-27" "2021-05-28" "2021-05-29"
# forecast with lm model
forecast_with_lr = function(fmla, data, forecast_data) {
    fitted_lm = lm(as.formula(fmla), data)
    forecasted = predict(fitted_lm, forecast_data)
    return(list(forecast = as.numeric(forecasted), model = fitted_lm))
}

# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "log_sold", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arima(input_series, order = c(9, 0, 0))

    forecasted = forecast(fitted, h = forecast_ahead)
    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}

We define our functions with respect to parameters we found above.Then,we forecast for test dates with our lm and arima models separately.

# loop over the test dates
forecast_ahead = 1


results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
    current_date = test_dates[i] - forecast_ahead
    print(test_dates[i])
    past_data = maproducts[event_date <= current_date]
    forecast_data = maproducts[event_date == test_dates[i]]

    # first lm models
    fmla = "log_sold ~ as.factor(month) + price + category_sold + 
    lag_1 + lag_2 + outlier_great"
    forecasted = forecast_with_lr(fmla, past_data, forecast_data)
    forecast_data[, `:=`(lm_prediction, forecasted$forecast)]

    # arima model with auto.arima
    arima_forecast = forecast_with_arima(past_data, forecast_ahead, 
        "log_sold", is_trace = F)
    forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]

    results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$log_sold"
## [1] "2021-05-12"
## [1] "input_series=data$log_sold"
## [1] "2021-05-13"
## [1] "input_series=data$log_sold"
## [1] "2021-05-14"
## [1] "input_series=data$log_sold"
## [1] "2021-05-15"
## [1] "input_series=data$log_sold"
## [1] "2021-05-16"
## [1] "input_series=data$log_sold"
## [1] "2021-05-17"
## [1] "input_series=data$log_sold"
## [1] "2021-05-18"
## [1] "input_series=data$log_sold"
## [1] "2021-05-19"
## [1] "input_series=data$log_sold"
## [1] "2021-05-20"
## [1] "input_series=data$log_sold"
## [1] "2021-05-21"
## [1] "input_series=data$log_sold"
## [1] "2021-05-22"
## [1] "input_series=data$log_sold"
## [1] "2021-05-23"
## [1] "input_series=data$log_sold"
## [1] "2021-05-24"
## [1] "input_series=data$log_sold"
## [1] "2021-05-25"
## [1] "input_series=data$log_sold"
## [1] "2021-05-26"
## [1] "input_series=data$log_sold"
## [1] "2021-05-27"
## [1] "input_series=data$log_sold"
## [1] "2021-05-28"
## [1] "input_series=data$log_sold"
## [1] "2021-05-29"
## [1] "input_series=data$log_sold"
overall_results = rbindlist(results)

melted_result = melt(overall_results, c("event_date", "log_sold"), 
    c("lm_prediction", "arima_prediction"))

We turned the results into melted results(long form).

melted_result
##     event_date log_sold         variable    value
##  1: 2021-05-11 3.433987    lm_prediction 3.026657
##  2: 2021-05-12 2.995732    lm_prediction 3.036444
##  3: 2021-05-13 2.772589    lm_prediction 3.112824
##  4: 2021-05-14 3.555348    lm_prediction 3.113620
##  5: 2021-05-15 3.135494    lm_prediction 3.040228
##  6: 2021-05-16 3.258097    lm_prediction 3.071004
##  7: 2021-05-17 3.091042    lm_prediction 2.953569
##  8: 2021-05-18 2.944439    lm_prediction 2.989225
##  9: 2021-05-19 2.995732    lm_prediction 2.900105
## 10: 2021-05-20 2.833213    lm_prediction 2.651231
## 11: 2021-05-21 2.484907    lm_prediction 2.707107
## 12: 2021-05-22 2.397895    lm_prediction 2.711366
## 13: 2021-05-23 2.833213    lm_prediction 2.556579
## 14: 2021-05-24 1.945910    lm_prediction 2.662905
## 15: 2021-05-25 2.772589    lm_prediction 2.607640
## 16: 2021-05-26 2.484907    lm_prediction 2.446789
## 17: 2021-05-27 2.079442    lm_prediction 2.497629
## 18: 2021-05-28 2.197225    lm_prediction 2.549871
## 19: 2021-05-29 2.639057    lm_prediction 4.101142
## 20: 2021-05-11 3.433987 arima_prediction 3.753423
## 21: 2021-05-12 2.995732 arima_prediction 3.752491
## 22: 2021-05-13 2.772589 arima_prediction 3.750498
## 23: 2021-05-14 3.555348 arima_prediction 3.749736
## 24: 2021-05-15 3.135494 arima_prediction 3.751519
## 25: 2021-05-16 3.258097 arima_prediction 3.748914
## 26: 2021-05-17 3.091042 arima_prediction 3.748967
## 27: 2021-05-18 2.944439 arima_prediction 3.748417
## 28: 2021-05-19 2.995732 arima_prediction 3.748331
## 29: 2021-05-20 2.833213 arima_prediction 3.747522
## 30: 2021-05-21 2.484907 arima_prediction 3.745967
## 31: 2021-05-22 2.397895 arima_prediction 3.742094
## 32: 2021-05-23 2.833213 arima_prediction 3.742548
## 33: 2021-05-24 1.945910 arima_prediction 3.743295
## 34: 2021-05-25 2.772589 arima_prediction 3.743557
## 35: 2021-05-26 2.484907 arima_prediction 3.743530
## 36: 2021-05-27 2.079442 arima_prediction 3.741925
## 37: 2021-05-28 2.197225 arima_prediction 3.743136
## 38: 2021-05-29 2.639057 arima_prediction 3.743903
##     event_date log_sold         variable    value
accu = function(actual, forecast) {
    n = length(actual)
    error = actual - forecast
    mean = mean(actual)
    sd = sd(actual)
    CV = sd/mean
    FBias = sum(error)/sum(actual)
    MAPE = sum(abs(error/actual))/n
    RMSE = sqrt(sum(error^2)/n)
    MAD = sum(abs(error))/n
    MADP = sum(abs(error))/sum(abs(actual))
    WMAPE = MAD/mean
    l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP, 
        WMAPE)
    return(l)
}

To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.

performance = melted_result[, accu(log_sold, value), by = list(variable)]

performance
##            variable  n     mean        sd        CV       FBias      MAPE
## 1:    lm_prediction 19 2.781622 0.4392261 0.1579029 -0.03566864 0.1210269
## 2: arima_prediction 19 2.781622 0.4392261 0.1579029 -0.34699471 0.3816355
##         RMSE       MAD      MADP     WMAPE
## 1: 0.4465979 0.3125010 0.1123449 0.1123449
## 2: 1.0544824 0.9652081 0.3469947 0.3469947

First view of data shows that lm prediction is better in terms of MAPE.However,let us see boxplot MAPE and FBias with respect to days of a week.

performance = melted_result[, accu(log_sold, value), by = list(event_date, 
    variable)]
performance[, `:=`(day_of_week, wday(event_date, label = T))]


ggplot(performance, aes(x = day_of_week, y = MAPE, fill = variable)) + 
    geom_boxplot()

ggplot(performance, aes(x = day_of_week, y = FBias, fill = variable)) + 
    geom_boxplot()

As we observe from table,MAPE values of lm prediction is less.They both tend to predict less.However,lm is better than arima in terms of FBias ,too.

So,we forecast with our lm model.Before that,we forecast our parameters by building arima model with auto arima function.

price_ts <- ts(maproducts$price, frequency = 7)
price_dec <- decompose(x = price_ts, type = "additive")
price_model = auto.arima(price_dec$random)
AIC(price_model)
## [1] 1812.927
price_model_forecast <- predict(price_model, n.ahead = 15)$pred
seasonality = price_dec$seasonal[1:1]
last_trend_value <- tail(price_dec$trend[!is.na(price_dec$trend)], 
    1)
price_model_forecast = price_model_forecast + last_trend_value + 
    seasonality
price_model_forecast
## Time Series:
## Start = c(54, 2) 
## End = c(56, 2) 
## Frequency = 7 
##  [1] 235.8121 235.7070 235.8431 236.0378 235.9613 235.9003 235.9003 235.9003
##  [9] 235.9003 235.9003 235.9003 235.9003 235.9003 235.9003 235.9003
model_cat <- auto.arima(maproducts$category_sold)
model_fav <- auto.arima(maproducts$favored_count)
fav_fcast <- predict(model_fav, n.ahead = 15)$pred
cat_fcast <- predict(model_cat, n.ahead = 15)$pred
model_lag_1 <- auto.arima(maproducts$lag_1)
lag1_fcast <- predict(model_lag_1, n.ahead = 15)$pred
model_lag_2 <- auto.arima(maproducts$lag_2)
lag2_fcast <- predict(model_lag_2, n.ahead = 15)$pred
model_brand <- auto.arima(maproducts$category_brand_sold)
brand_fcast <- predict(model_brand, n.ahead = 15)$pred
model_out <- auto.arima(maproducts$outlier_small)
small_fcast <- predict(model_out, n.ahead = 15)$pred

Now,we forecast our value with our forecasted parameters.

forecast_Log <- predict(lm_model2_3, data.frame(month = as.factor(6), 
    price = price_model_forecast[1], trend = 373, favored_count = fav_fcast[1], 
    category_sold = cat_fcast[1], category_brand_sold = brand_fcast[1], 
    lag_1 = lag1_fcast[1], lag_2 = lag2_fcast[1], outlier_small = small_fcast[1]))


fcast3 <- exp(forecast_Log)
fcast3
##        1 
## 29.69554
data_path = "/Users/onurcanaydin/Desktop/360 proje/ProjectRawData.csv"
daily_data = fread(data_path)
str(daily_data)
## Classes 'data.table' and 'data.frame':   4331 obs. of  13 variables:
##  $ event_date         : IDate, format: "2021-05-31" "2021-05-31" ...
##  $ product_content_id : int  85004 4066298 6676673 7061886 31515569 32737302 32939029 48740784 73318567 85004 ...
##  $ price              : num  87.4 65.8 120.7 294.4 60.3 ...
##  $ sold_count         : int  80 1398 345 24 454 44 125 2 175 85 ...
##  $ visit_count        : int  6002 19102 19578 1485 20114 5017 5301 221 20256 5265 ...
##  $ basket_count       : int  744 5096 1261 81 2871 316 608 15 965 633 ...
##  $ favored_count      : int  1642 1703 1510 122 2102 607 698 22 1960 1104 ...
##  $ category_sold      : int  5048 6547 4944 951 8155 5198 930 1684 5198 4117 ...
##  $ category_visits    : int  236197 108811 306462 95645 637143 1010634 41973 329087 1010634 215260 ...
##  $ category_basket    : int  33681 28558 23418 4704 49389 33728 3911 12614 33728 25181 ...
##  $ category_favored   : int  40472 11913 26597 8886 62460 96699 5791 24534 96699 36225 ...
##  $ category_brand_sold: int  743 4286 786 179 1759 3665 875 12 3665 430 ...
##  $ ty_visits          : int  125439876 125439876 125439876 125439876 125439876 125439876 125439876 125439876 125439876 131821083 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Product 1

When we look at the sales graph of product 1(‘mont’) , we see that there is no sales in some periods.At first,I thought the reason for this was that the store in trendyol had removed product from the aisle.However,then when I saw that the product was added to the basket and favorites during these non-sale periods, I realized that the reason for this was seasonality.

daily_data$event_date <- as.Date(daily_data$event_date, "%d.%m.%Y")
product1_data <- daily_data[product_content_id == 48740784]
ggplot(product1_data, aes(x = event_date, y = sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

If we see the sales period more closely:

ggplot(product1_data[as.Date(event_date) <= "2021-01-01" & as.Date(event_date) >= 
    "2020-10-01"], aes(x = event_date, y = sold_count)) + geom_line(colour = "firebrick2", 
    size = 1.5) + theme_dark()

We established both arima and linear regression models for the products.Then we compared these forecasting models and chose the best one.At first, we built a linear regression model using all predictors for product1.

product1_reg <- lm(sold_count ~ price + visit_count + basket_count + 
    favored_count + category_sold + category_visits + category_basket + 
    category_favored + category_brand_sold + ty_visits, product1_data)
summary(product1_reg)
## 
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count + 
##     favored_count + category_sold + category_visits + category_basket + 
##     category_favored + category_brand_sold + ty_visits, data = product1_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5381 -1.3319 -0.4026  1.2313  7.7383 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          5.320e+00  2.276e+00   2.337   0.0229 *  
## price               -3.457e-03  2.818e-03  -1.227   0.2250    
## visit_count         -1.698e-03  1.968e-02  -0.086   0.9315    
## basket_count         1.990e-01  1.006e-02  19.787  < 2e-16 ***
## favored_count       -5.923e-02  1.714e-01  -0.346   0.7309    
## category_sold        1.232e-03  1.075e-03   1.146   0.2564    
## category_visits     -2.694e-06  1.126e-05  -0.239   0.8118    
## category_basket      1.399e-06  1.506e-06   0.929   0.3568    
## category_favored    -6.610e-05  1.193e-05  -5.540 7.72e-07 ***
## category_brand_sold  9.788e-06  8.014e-06   1.221   0.2269    
## ty_visits           -2.134e-08  1.171e-08  -1.822   0.0736 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.611 on 58 degrees of freedom
##   (303 observations deleted due to missingness)
## Multiple R-squared:  0.8873, Adjusted R-squared:  0.8679 
## F-statistic: 45.66 on 10 and 58 DF,  p-value: < 2.2e-16

We’ve seen that basket_count and category_favored are effective predictors.The adjusted R-squared value was also high.The p value of the f test is fine.

product1_reg <- lm(sold_count ~ price + basket_count + category_sold + 
    category_basket + category_favored + category_brand_sold + 
    ty_visits, product1_data)
summary(product1_reg)
## 
## Call:
## lm(formula = sold_count ~ price + basket_count + category_sold + 
##     category_basket + category_favored + category_brand_sold + 
##     ty_visits, data = product1_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4637 -1.3676 -0.4079  1.2025  7.6485 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          5.207e+00  2.213e+00   2.353   0.0219 *  
## price               -3.432e-03  2.745e-03  -1.250   0.2160    
## basket_count         1.981e-01  9.856e-03  20.102  < 2e-16 ***
## category_sold        6.619e-04  6.786e-04   0.975   0.3332    
## category_basket      9.669e-07  9.100e-07   1.062   0.2922    
## category_favored    -6.118e-05  1.041e-05  -5.878 1.87e-07 ***
## category_brand_sold  7.548e-06  6.904e-06   1.093   0.2786    
## ty_visits           -2.451e-08  1.021e-08  -2.401   0.0194 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.57 on 61 degrees of freedom
##   (303 observations deleted due to missingness)
## Multiple R-squared:  0.8852, Adjusted R-squared:  0.8721 
## F-statistic: 67.21 on 7 and 61 DF,  p-value: < 2.2e-16

We further increased the adjusted R-squared value by removing redundant predictors. We found the forecast values in the train period with the linear model we built and we printed the forecast values over the actual values on the chart.

product1_data[, `:=`(forecast1, predict(product1_reg, product1_data))]
ggplot(product1_data, aes(x = event_date)) + geom_line(aes(y = sold_count, 
    color = "real"), size = 1.5) + geom_line(aes(y = forecast1, 
    color = "predicted"), size = 1.5) + theme_dark()
## Warning: Removed 127 row(s) containing missing values (geom_path).

We added a daytype column to the product 1 data table to find out if there is daily seasonality in the sales data.The daytype variable returns from 1 to 7 for days.We also added daytype to the linear model.

day_type <- read_excel("/Users/onurcanaydin/Downloads/daytype.xlsx", 
    range = "C1:C372", col_names = FALSE)
## New names:
## * `` -> ...1
product1_data <- cbind(product1_data, day_type)
names(product1_data)[15] <- "day_type"

product1_reg <- lm(sold_count ~ price + basket_count + category_sold + 
    category_basket + category_favored + category_brand_sold + 
    ty_visits + as.factor(day_type), product1_data)
summary(product1_reg)
## 
## Call:
## lm(formula = sold_count ~ price + basket_count + category_sold + 
##     category_basket + category_favored + category_brand_sold + 
##     ty_visits + as.factor(day_type), data = product1_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9503 -1.0827 -0.3769  1.2563  6.7710 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.247e+00  2.317e+00   2.265   0.0275 *  
## price                -2.925e-03  2.823e-03  -1.036   0.3046    
## basket_count          1.964e-01  1.019e-02  19.277  < 2e-16 ***
## category_sold         5.971e-04  6.931e-04   0.862   0.3927    
## category_basket       1.092e-06  9.473e-07   1.153   0.2540    
## category_favored     -5.925e-05  1.063e-05  -5.572 7.83e-07 ***
## category_brand_sold   8.041e-06  7.055e-06   1.140   0.2593    
## ty_visits            -2.212e-08  1.040e-08  -2.126   0.0380 *  
## as.factor(day_type)2 -1.768e+00  1.236e+00  -1.430   0.1584    
## as.factor(day_type)3 -1.196e+00  1.290e+00  -0.927   0.3578    
## as.factor(day_type)4 -7.771e-01  1.157e+00  -0.672   0.5045    
## as.factor(day_type)5  4.665e-01  1.222e+00   0.382   0.7040    
## as.factor(day_type)6  3.529e-01  1.193e+00   0.296   0.7684    
## as.factor(day_type)7 -1.021e+00  1.218e+00  -0.838   0.4056    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.569 on 55 degrees of freedom
##   (303 observations deleted due to missingness)
## Multiple R-squared:  0.8965, Adjusted R-squared:  0.8721 
## F-statistic: 36.66 on 13 and 55 DF,  p-value: < 2.2e-16
AIC(product1_reg)
## [1] 340.3843

The daytype regressor had no significant effect on the model.Therefore, we removed the back daytype from the model.

product1_reg <- lm(sold_count ~ price + basket_count + category_basket + 
    category_favored + ty_visits, product1_data)
summary(product1_reg)
## 
## Call:
## lm(formula = sold_count ~ price + basket_count + category_basket + 
##     category_favored + ty_visits, data = product1_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.675 -1.334 -0.380  1.095  7.449 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.988e+00  2.088e+00   2.868  0.00561 ** 
## price            -4.417e-03  2.625e-03  -1.683  0.09742 .  
## basket_count      1.945e-01  9.330e-03  20.841  < 2e-16 ***
## category_basket   1.259e-06  7.434e-07   1.693  0.09532 .  
## category_favored -5.345e-05  7.502e-06  -7.124 1.21e-09 ***
## ty_visits        -2.585e-08  9.533e-09  -2.711  0.00863 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.565 on 63 degrees of freedom
##   (303 observations deleted due to missingness)
## Multiple R-squared:  0.8819, Adjusted R-squared:  0.8725 
## F-statistic:  94.1 on 5 and 63 DF,  p-value: < 2.2e-16
AIC(product1_reg)
## [1] 333.5134

When we look at the graph, the forecast and the actual values overlap in general, only the forecast graph was cut in some periods, this is because the value of the price regressor in our linear regression model was not given in some periods in the excel data that given to us.

product1_data[, `:=`(forecast2, predict(product1_reg, product1_data))]
ggplot(product1_data, aes(x = event_date)) + geom_line(aes(y = sold_count, 
    color = "real"), size = 1.5) + geom_line(aes(y = forecast2, 
    color = "predicted"), size = 1.5) + theme_dark()
## Warning: Removed 127 row(s) containing missing values (geom_path).

If we see the sales period more closely:

ggplot(product1_data[as.Date(event_date) <= "2021-01-01" & as.Date(event_date) >= 
    "2020-10-01"], aes(x = event_date)) + geom_line(aes(y = sold_count, 
    color = "real"), size = 1.5) + geom_line(aes(y = forecast2, 
    color = "predicted"), size = 1.5) + theme_dark()
## Warning: Removed 17 row(s) containing missing values (geom_path).

ARIMA model for product1

We did decomposition at daily level for product1.Then, we build arima model with auto arima function.

datats <- ts(product1_data$sold_count, start = as.Date("2020-05-25"), 
    end = as.Date("2021-05-31"), frequency = 7)
ts_decomposed <- decompose(x = datats, type = "additive")
model = auto.arima(ts_decomposed$random, max.p = 2, max.q = 2)
AIC(model)
## [1] 12491.09

Forecasting for product1

We compared the aic values of the arima model and the linear model and chose the linear model.In order to be able to forecast with linear model in the test period, we must also forecast the value of regressors by constructing arima models.

ARIMA models for predictors of Linear Model of Product1

We build arima models by using past data for forecasting predictors or regressors of linear model of product 1.

ARIMA model for Basket_count

we build arima model with decomposing at daily level for basket_count regressor and forecast the value of basket_count on the desired day in competition period.

ggplot(product1_data, aes(x = event_date, y = basket_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

ggplot(product1_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >= 
    "2021-05-01"], aes(x = event_date, y = basket_count)) + geom_line(colour = "firebrick2", 
    size = 1.5) + theme_dark()

basket_count_ts <- ts(product1_data$basket_count, start = as.Date("2021-05-04"), 
    end = as.Date("2021-05-31"), frequency = 7)
basket_count_dec <- decompose(x = basket_count_ts, type = "additive")
basket_count_model = auto.arima(basket_count_dec$random)
AIC(basket_count_model)
## [1] 829.9022
basket_count_model_forecast <- predict(basket_count_model, n.ahead = 16)$pred
seasonality = basket_count_dec$seasonal[1:16]
last_trend_value <- tail(basket_count_dec$trend[!is.na(basket_count_dec$trend)], 
    1)
basket_count_model_forecast = basket_count_model_forecast + last_trend_value + 
    seasonality
basket_count_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 13.516082 16.281420 13.170164  9.096985  7.575588 10.531312 11.747101
##  [8] 12.152816 11.341370 11.304996 10.528811 10.424621 11.352183 10.750536
## [15] 10.853123 11.315081

ARIMA model for Category_basket

we build arima model with decomposing at daily level for category_basket regressor and forecast the value of category_basket on the desired day in the competition period.

ggplot(product1_data, aes(x = event_date, y = category_basket)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

category_basket_ts <- ts(product1_data$category_basket, start = as.Date("2021-02-10"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_basket_dec <- decompose(x = category_basket_ts, type = "additive")
category_basket_model = auto.arima(category_basket_dec$random)
AIC(category_basket_model)
## [1] 20261.14
category_basket_model_forecast <- predict(category_basket_model, 
    n.ahead = 16)$pred
seasonality = category_basket_dec$seasonal[1:16]
last_trend_value <- tail(category_basket_dec$trend[!is.na(category_basket_dec$trend)], 
    1)
category_basket_model_forecast = category_basket_model_forecast + 
    last_trend_value + seasonality
category_basket_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 535119.9 537626.4 575312.0 545347.8 508458.8 524560.0 545403.9 528680.5
##  [9] 535887.3 573328.2 545817.4 509150.1 526335.6 544140.2 530285.0 536722.4

ARIMA model for Category_favored

we build arima model with decomposing at daily level for Category_favored regressor and forecast the value of category_favored on the desired day in the competition period.

ggplot(product1_data, aes(x = event_date, y = category_favored)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

fitted = auto.arima(product1_data$category_favored)
c = forecast(fitted, h = 16)
category_favored_model_forecast = c$mean
category_favored_model_forecast
## Time Series:
## Start = 373 
## End = 388 
## Frequency = 1 
##  [1] 6975.930 6957.469 6828.073 6854.624 6955.408 6925.019 6847.408 6878.740
##  [9] 6937.782 6907.394 6863.077 6891.366 6924.130 6898.571 6874.774 6897.335

ARIMA model for ty_visits

ggplot(product1_data, aes(x = event_date, y = ty_visits)) + geom_line(colour = "firebrick2", 
    size = 1.5) + theme_dark()

ty_visits_ts <- ts(product1_data$ty_visits, start = as.Date("2021-02-07"), 
    end = as.Date("2021-05-31"), frequency = 7)
ty_visits_dec <- decompose(x = ty_visits_ts, type = "additive")
ty_visits_model = auto.arima(ty_visits_dec$random)
AIC(ty_visits_model)
## [1] 27395.99
ty_visits_model_forecast <- predict(ty_visits_model, n.ahead = 16)$pred
seasonality = ty_visits_dec$seasonal[1:16]
last_trend_value <- tail(ty_visits_dec$trend[!is.na(ty_visits_dec$trend)], 
    1)
ty_visits_model_forecast = ty_visits_model_forecast + last_trend_value + 
    seasonality
ty_visits_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 119538618 120121585 127286838 125263590 119946681 118228548 120508962
##  [8] 119625332 120422516 126742923 124677015 119907503 118593636 120600688
## [15] 119696340 120668941

Forecasted value of sales quantity of product 1 in the desired day

We made our predictions in the linear model we built by using the fitted values of the regressors we found in the arima models.

predict(product1_reg, data.frame(price = 449.985, basket_count = basket_count_model_forecast[16], 
    category_basket = category_basket_model_forecast[16], category_favored = category_favored_model_forecast[16], 
    ty_visits = ty_visits_model_forecast[16]))
##        1 
## 3.389179

Product2

When we look at the sales chart of product2, it is understood that it is a summer product.Sales volumes showed an upward trend in May.

product2_data <- daily_data[product_content_id == 73318567]
ggplot(product2_data, aes(x = event_date, y = sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

If we see the sales period more closely:

ggplot(product2_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >= 
    "2021-04-01"], aes(x = event_date, y = sold_count)) + geom_line(colour = "firebrick2", 
    size = 1.5) + theme_dark()

We established both arima and linear regression models for the product 2.Then we compared these forecasting models and chose the best one.At first, we built a linear regression model using all predictors for product2.

product2_reg <- lm(sold_count ~ price + visit_count + basket_count + 
    favored_count + category_sold + category_visits + category_basket + 
    category_favored + category_brand_sold + ty_visits, product2_data)
summary(product2_reg)
## 
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count + 
##     favored_count + category_sold + category_visits + category_basket + 
##     category_favored + category_brand_sold + ty_visits, data = product2_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.031  -6.321   0.987   5.517  22.830 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -2.309e+02  5.817e+02  -0.397  0.69247    
## price                3.951e+00  9.683e+00   0.408  0.68431    
## visit_count         -2.204e-03  1.065e-03  -2.069  0.04174 *  
## basket_count         2.143e-01  1.522e-02  14.078  < 2e-16 ***
## favored_count        4.831e-03  8.174e-03   0.591  0.55614    
## category_sold        1.396e-02  3.156e-03   4.423 3.04e-05 ***
## category_visits      9.219e-05  4.741e-05   1.945  0.05535 .  
## category_basket      1.047e-04  3.738e-05   2.802  0.00637 ** 
## category_favored    -1.513e-03  4.482e-04  -3.376  0.00114 ** 
## category_brand_sold -5.679e-04  2.864e-04  -1.983  0.05078 .  
## ty_visits           -1.190e-07  4.082e-08  -2.917  0.00459 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.916 on 80 degrees of freedom
##   (281 observations deleted due to missingness)
## Multiple R-squared:  0.985,  Adjusted R-squared:  0.9832 
## F-statistic: 526.2 on 10 and 80 DF,  p-value: < 2.2e-16

We’ve seen that basket_count and category_sold are most effective predictors.The adjusted R-squared value was also very high.The p value of the f test is fine.

product2_reg <- lm(sold_count ~ visit_count + basket_count + 
    favored_count + category_sold + category_visits + category_basket + 
    category_favored + category_brand_sold + ty_visits, product2_data)
summary(product2_reg)
## 
## Call:
## lm(formula = sold_count ~ visit_count + basket_count + favored_count + 
##     category_sold + category_visits + category_basket + category_favored + 
##     category_brand_sold + ty_visits, data = product2_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.1677  -1.0210  -0.1672   1.2207  26.3600 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6.973e-01  5.174e-01   1.348  0.17863    
## visit_count         -3.953e-04  5.202e-04  -0.760  0.44784    
## basket_count         2.230e-01  7.330e-03  30.416  < 2e-16 ***
## favored_count       -9.409e-03  3.283e-03  -2.866  0.00440 ** 
## category_sold        2.643e-03  6.653e-04   3.973 8.58e-05 ***
## category_visits      3.666e-06  7.381e-06   0.497  0.61972    
## category_basket      5.862e-05  1.120e-05   5.233 2.84e-07 ***
## category_favored    -2.487e-04  6.428e-05  -3.869  0.00013 ***
## category_brand_sold -2.455e-04  9.108e-05  -2.695  0.00736 ** 
## ty_visits           -7.481e-08  9.687e-09  -7.722 1.13e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.552 on 362 degrees of freedom
## Multiple R-squared:  0.9863, Adjusted R-squared:  0.9859 
## F-statistic:  2886 on 9 and 362 DF,  p-value: < 2.2e-16
product2_reg <- lm(sold_count ~ visit_count + basket_count + 
    favored_count + category_sold + category_basket + category_favored + 
    category_brand_sold + ty_visits, product2_data)
summary(product2_reg)
## 
## Call:
## lm(formula = sold_count ~ visit_count + basket_count + favored_count + 
##     category_sold + category_basket + category_favored + category_brand_sold + 
##     ty_visits, data = product2_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.3270  -0.9939  -0.1711   1.2007  26.2894 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          5.746e-01  4.542e-01   1.265  0.20664    
## visit_count         -3.430e-04  5.089e-04  -0.674  0.50076    
## basket_count         2.238e-01  7.128e-03  31.396  < 2e-16 ***
## favored_count       -9.772e-03  3.197e-03  -3.056  0.00241 ** 
## category_sold        2.586e-03  6.546e-04   3.950 9.38e-05 ***
## category_basket      5.595e-05  9.814e-06   5.701 2.47e-08 ***
## category_favored    -2.291e-04  5.064e-05  -4.523 8.26e-06 ***
## category_brand_sold -2.343e-04  8.817e-05  -2.658  0.00822 ** 
## ty_visits           -7.235e-08  8.326e-09  -8.690  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.547 on 363 degrees of freedom
## Multiple R-squared:  0.9862, Adjusted R-squared:  0.9859 
## F-statistic:  3253 on 8 and 363 DF,  p-value: < 2.2e-16
product2_reg <- lm(sold_count ~ basket_count + favored_count + 
    category_sold + category_basket + category_favored + category_brand_sold + 
    ty_visits, product2_data)
summary(product2_reg)
## 
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold + 
##     category_basket + category_favored + category_brand_sold + 
##     ty_visits, data = product2_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.2786  -0.9534  -0.1677   1.1699  25.5679 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          5.598e-01  4.534e-01   1.235 0.217661    
## basket_count         2.202e-01  4.800e-03  45.879  < 2e-16 ***
## favored_count       -1.167e-02  1.516e-03  -7.695 1.34e-13 ***
## category_sold        2.553e-03  6.523e-04   3.914 0.000108 ***
## category_basket      5.476e-05  9.646e-06   5.677 2.81e-08 ***
## category_favored    -2.279e-04  5.058e-05  -4.506 8.90e-06 ***
## category_brand_sold -2.217e-04  8.610e-05  -2.575 0.010414 *  
## ty_visits           -7.257e-08  8.313e-09  -8.729  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.542 on 364 degrees of freedom
## Multiple R-squared:  0.9862, Adjusted R-squared:  0.986 
## F-statistic:  3723 on 7 and 364 DF,  p-value: < 2.2e-16

We further increased the adjusted R-squared value by removing redundant predictors.

product2_data <- cbind(product2_data, day_type)
names(product2_data)[14] <- "day_type"
product2_reg <- lm(sold_count ~ basket_count + favored_count + 
    category_sold + category_basket + category_favored + category_brand_sold + 
    ty_visits + as.factor(day_type), product2_data)
summary(product2_reg)
## 
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold + 
##     category_basket + category_favored + category_brand_sold + 
##     ty_visits + as.factor(day_type), data = product2_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -26.3240  -1.1225  -0.0821   1.2715  25.3379 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4.751e-01  8.466e-01  -0.561 0.574981    
## basket_count          2.203e-01  4.831e-03  45.611  < 2e-16 ***
## favored_count        -1.174e-02  1.527e-03  -7.688 1.45e-13 ***
## category_sold         2.500e-03  6.646e-04   3.761 0.000198 ***
## category_basket       5.466e-05  9.709e-06   5.630 3.64e-08 ***
## category_favored     -2.233e-04  5.161e-05  -4.326 1.98e-05 ***
## category_brand_sold  -2.190e-04  8.672e-05  -2.525 0.011990 *  
## ty_visits            -7.301e-08  8.358e-09  -8.735  < 2e-16 ***
## as.factor(day_type)2  8.852e-01  1.080e+00   0.820 0.412832    
## as.factor(day_type)3  1.475e+00  1.079e+00   1.366 0.172698    
## as.factor(day_type)4  1.012e+00  1.080e+00   0.937 0.349198    
## as.factor(day_type)5  8.642e-01  1.086e+00   0.796 0.426773    
## as.factor(day_type)6  1.531e+00  1.083e+00   1.413 0.158421    
## as.factor(day_type)7  1.473e+00  1.079e+00   1.365 0.173003    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.565 on 358 degrees of freedom
## Multiple R-squared:  0.9863, Adjusted R-squared:  0.9858 
## F-statistic:  1989 on 13 and 358 DF,  p-value: < 2.2e-16

We added a daytype column to the product 1 data table to find out if there is daily seasonality in the sales data.The daytype variable returns from 1 to 7 for days.We also added daytype to the linear model.The daytype regressor had no significant effect on the model.Therefore, we removed the back daytype from the model.

product2_reg <- lm(sold_count ~ basket_count + favored_count + 
    category_sold + category_basket + category_favored + category_brand_sold + 
    ty_visits, product2_data)
summary(product2_reg)
## 
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold + 
##     category_basket + category_favored + category_brand_sold + 
##     ty_visits, data = product2_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.2786  -0.9534  -0.1677   1.1699  25.5679 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          5.598e-01  4.534e-01   1.235 0.217661    
## basket_count         2.202e-01  4.800e-03  45.879  < 2e-16 ***
## favored_count       -1.167e-02  1.516e-03  -7.695 1.34e-13 ***
## category_sold        2.553e-03  6.523e-04   3.914 0.000108 ***
## category_basket      5.476e-05  9.646e-06   5.677 2.81e-08 ***
## category_favored    -2.279e-04  5.058e-05  -4.506 8.90e-06 ***
## category_brand_sold -2.217e-04  8.610e-05  -2.575 0.010414 *  
## ty_visits           -7.257e-08  8.313e-09  -8.729  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.542 on 364 degrees of freedom
## Multiple R-squared:  0.9862, Adjusted R-squared:  0.986 
## F-statistic:  3723 on 7 and 364 DF,  p-value: < 2.2e-16
AIC(product2_reg)
## [1] 2339.655

ARIMA model for product2

We did decomposition at daily level for product1.Then, we build arima model with auto arima function.

datats2 <- ts(product2_data$sold_count, start = as.Date("2020-05-25"), 
    end = as.Date("2021-05-31"), frequency = 7)
ts_dec_add <- decompose(x = datats2, type = "additive")
model2 = auto.arima(ts_dec_add$random, max.p = 2, max.q = 2)
AIC(model2)
## [1] 19849.58

Forecasting for product2

We compared the aic values of the arima model and the linear model and chose the linear model.In order to be able to forecast with linear model in the test period, we must also forecast the value of regressors by constructing arima models.

ARIMA models for predictors of Linear Model of Product2

We build arima models by using past data for forecasting predictors or regressors of linear model of product 2.

ARIMA model for Basket_count

we build arima model with decomposing at daily level for basket_count regressor and forecast the value of basket_count on the desired day in competition period.

ggplot(product2_data, aes(x = event_date, y = basket_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

ggplot(product2_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >= 
    "2021-05-01"], aes(x = event_date, y = basket_count)) + geom_line(colour = "firebrick2", 
    size = 1.5) + theme_dark()

basket_count_ts <- ts(product2_data$basket_count, start = as.Date("2021-02-09"), 
    end = as.Date("2021-05-31"), frequency = 7)
basket_count_dec <- decompose(x = basket_count_ts, type = "additive")
basket_count_model = auto.arima(basket_count_dec$random)
AIC(basket_count_model)
## [1] 8716.526
basket_count_model_forecast <- predict(basket_count_model, n.ahead = 16)$pred
seasonality = basket_count_dec$seasonal[1:16]
last_trend_value <- tail(basket_count_dec$trend[!is.na(basket_count_dec$trend)], 
    1)
basket_count_model_forecast = basket_count_model_forecast + last_trend_value + 
    seasonality
basket_count_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 456.4614 493.7593 540.5048 520.2998 482.9880 472.1964 464.4085 461.4595
##  [9] 483.1937 516.1933 509.0330 486.0518 473.1542 466.6694 462.7477 480.3704

ARIMA model for favored_count

we build arima model with decomposing at daily level for favored_count regressor and forecast the value of favored_count on the desired day in the competition period.

ggplot(product2_data, aes(x = event_date, y = favored_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

ggplot(product2_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >= 
    "2021-05-01"], aes(x = event_date, y = favored_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

favored_count_ts <- ts(product2_data$favored_count, start = as.Date("2021-02-13"), 
    end = as.Date("2021-05-31"), frequency = 7)
favored_count_dec <- decompose(x = favored_count_ts, type = "additive")
favored_count_model = auto.arima(favored_count_dec$random, max.p = 3, 
    max.q = 3)
AIC(favored_count_model)
## [1] 10299.58
favored_count_model_forecast <- predict(favored_count_model, 
    n.ahead = 16)$pred
seasonality = favored_count_dec$seasonal[1:16]
last_trend_value <- tail(favored_count_dec$trend[!is.na(favored_count_dec$trend)], 
    1)
favored_count_model_forecast = favored_count_model_forecast + 
    last_trend_value + seasonality
favored_count_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 1824.777 2153.312 2681.863 2373.166 2044.357 2065.380 2141.401 2190.460
##  [9] 2101.854 1930.458 2055.127 2111.052 2070.982 2071.007 2077.568 2113.272

ARIMA model for category_sold

we build arima model with decomposing at daily level for category_sold regressor and forecast the value of category_sold on the desired day in the competition period.

ggplot(product2_data, aes(x = event_date, y = category_sold)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

ggplot(product2_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >= 
    "2021-05-01"], aes(x = event_date, y = category_sold)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

category_sold_ts <- ts(product2_data$category_sold, start = as.Date("2021-02-13"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_sold_dec <- decompose(x = category_sold_ts, type = "additive")
category_sold_model = auto.arima(category_sold_dec$random, max.p = 3, 
    max.q = 3)
AIC(category_sold_model)
## [1] 10544.62
category_sold_model_forecast <- predict(category_sold_model, 
    n.ahead = 16)$pred
seasonality = category_sold_dec$seasonal[1:16]
last_trend_value <- tail(category_sold_dec$trend[!is.na(category_sold_dec$trend)], 
    1)
category_sold_model_forecast = category_sold_model_forecast + 
    last_trend_value + seasonality
category_sold_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 4055.494 4645.928 4668.572 4119.550 3755.913 3742.885 3994.022 4172.623
##  [9] 4224.037 4143.844 3968.022 3947.163 4032.239 4118.171 4096.042 4068.097

ARIMA model for category_basket

ggplot(product2_data, aes(x = event_date, y = category_basket)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

ggplot(product2_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >= 
    "2021-05-01"], aes(x = event_date, y = category_basket)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

category_basket_ts <- ts(product2_data$category_basket, start = as.Date("2021-02-13"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_basket_dec <- decompose(x = category_basket_ts, type = "additive")
category_basket_model = auto.arima(category_basket_dec$random, 
    max.p = 2, max.q = 2)
AIC(category_basket_model)
## [1] 18077.75
category_basket_model_forecast <- predict(category_basket_model, 
    n.ahead = 16)$pred
seasonality = category_basket_dec$seasonal[1:16]
last_trend_value <- tail(category_basket_dec$trend[!is.na(category_basket_dec$trend)], 
    1)
category_basket_model_forecast = category_basket_model_forecast + 
    last_trend_value + seasonality
category_basket_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 495831.4 344890.7 251235.0 241199.6 273443.1 340637.0 391791.4 409243.7
##  [9] 418889.1 409840.0 387829.4 309074.0 279205.0 293899.9 333281.9 375688.2

ARIMA model for Category_favored

fitted = auto.arima(product2_data$category_favored)
a = forecast(fitted, h = 16)
category_favored_model_forecast = a$mean

ARIMA model for Category_brand_sold

ggplot(product2_data, aes(x = event_date, y = category_brand_sold)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

category_brand_sold_ts <- ts(product2_data$category_brand_sold, 
    start = as.Date("2021-01-26"), end = as.Date("2021-05-31"), 
    frequency = 7)
category_brand_sold_dec <- decompose(x = category_brand_sold_ts, 
    type = "additive")
category_brand_sold_model = auto.arima(category_brand_sold_dec$random, 
    max.p = 2, max.q = 2)
AIC(category_brand_sold_model)
## [1] 17870.52
category_brand_sold_model_forecast <- predict(category_brand_sold_model, 
    n.ahead = 16)$pred
seasonality = category_brand_sold_dec$seasonal[1:16]
last_trend_value <- tail(category_brand_sold_dec$trend[!is.na(category_brand_sold_dec$trend)], 
    1)
category_brand_sold_model_forecast = category_brand_sold_model_forecast + 
    last_trend_value + seasonality
category_brand_sold_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 11563.63 14974.64 15552.65 14983.35 12818.26 12058.24 11823.64 11867.48
##  [9] 14118.44 16579.25 15880.05 12965.66 11394.18 11289.36 12795.02 15079.36

ARIMA model for ty_visits

ggplot(product2_data, aes(x = event_date, y = ty_visits)) + geom_line(colour = "firebrick2", 
    size = 1.5) + theme_dark()

ty_visits_ts <- ts(product2_data$ty_visits, start = as.Date("2021-01-31"), 
    end = as.Date("2021-05-31"), frequency = 7)
ty_visits_dec <- decompose(x = ty_visits_ts, type = "additive")
ty_visits_model = auto.arima(ty_visits_dec$random)
AIC(ty_visits_model)
## [1] 29151.66
ty_visits_model_forecast <- predict(ty_visits_model, n.ahead = 16)$pred
seasonality = ty_visits_dec$seasonal[1:16]
last_trend_value <- tail(ty_visits_dec$trend[!is.na(ty_visits_dec$trend)], 
    1)
ty_visits_model_forecast = ty_visits_model_forecast + last_trend_value + 
    seasonality
ty_visits_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 100795851 100711318 107932972 104924710  99275802 100100587 102194310
##  [8] 101905062 101209260 105388726 103236585  99133886 100120419 102106750
## [15] 101767595 101365562

Forecasted value of sales quantity of product 2 in the desired day

We made our predictions in the linear model we built by using the fitted values of the regressors we found in the arima models.

predict(product2_reg, data.frame(basket_count = basket_count_model_forecast[16], 
    favored_count = favored_count_model_forecast[16], category_sold = category_sold_model_forecast[16], 
    category_basket = category_basket_model_forecast[16], category_favored = category_favored_model_forecast[16], 
    category_brand_sold = category_brand_sold_model_forecast[16], 
    ty_visits = ty_visits_model_forecast[16]))
##        1 
## 99.87471

Product 3

Product3 is the product of the same category as product2.Sales increase during and before the summer months.

product3_data <- daily_data[product_content_id == 32737302]
ggplot(product3_data, aes(x = event_date, y = sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

If we see the sales period more closely:

ggplot(product3_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >= 
    "2021-02-01"], aes(x = event_date, y = sold_count)) + geom_line(colour = "firebrick2", 
    size = 1.5) + theme_dark()

We constructed a linear regression model for product3 using all predictors.

product3_reg <- lm(sold_count ~ price + visit_count + basket_count + 
    favored_count + category_sold + category_visits + category_basket + 
    category_favored + category_brand_sold + ty_visits, product3_data)
summary(product3_reg)
## 
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count + 
##     favored_count + category_sold + category_visits + category_basket + 
##     category_favored + category_brand_sold + ty_visits, data = product3_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.5489  -3.7959  -0.5739   4.8847  17.9934 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.528e+01  2.438e+01   1.447  0.15006    
## price               -5.575e-01  3.986e-01  -1.399  0.16412    
## visit_count         -1.673e-04  2.092e-03  -0.080  0.93637    
## basket_count         1.720e-01  1.327e-02  12.958  < 2e-16 ***
## favored_count       -7.974e-03  1.083e-02  -0.736  0.46280    
## category_sold        3.237e-03  1.233e-03   2.625  0.00964 ** 
## category_visits      2.208e-05  1.000e-05   2.208  0.02891 *  
## category_basket      3.483e-05  1.701e-05   2.047  0.04256 *  
## category_favored    -3.099e-04  1.078e-04  -2.875  0.00468 ** 
## category_brand_sold -2.751e-04  1.618e-04  -1.701  0.09121 .  
## ty_visits           -2.793e-08  2.391e-08  -1.168  0.24469    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.913 on 139 degrees of freedom
##   (222 observations deleted due to missingness)
## Multiple R-squared:  0.8373, Adjusted R-squared:  0.8256 
## F-statistic: 71.51 on 10 and 139 DF,  p-value: < 2.2e-16

We’ve seen that basket_count, category_sold and category_favored are effective predictors.The adjusted R-squared value was also high.The p value of the f test is fine.

product3_reg <- lm(sold_count ~ basket_count + favored_count + 
    category_sold + category_visits + category_basket + category_favored + 
    category_brand_sold, product3_data)
summary(product3_reg)
## 
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold + 
##     category_visits + category_basket + category_favored + category_brand_sold, 
##     data = product3_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.1505  -0.7810  -0.1247   0.8607  21.1286 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.745e-01  3.727e-01   1.005 0.315600    
## basket_count         1.751e-01  6.137e-03  28.533  < 2e-16 ***
## favored_count       -1.419e-02  2.973e-03  -4.774 2.62e-06 ***
## category_sold        2.781e-03  6.039e-04   4.605 5.72e-06 ***
## category_visits      1.230e-05  4.195e-06   2.933 0.003568 ** 
## category_basket      1.450e-05  6.045e-06   2.399 0.016959 *  
## category_favored    -1.949e-04  5.141e-05  -3.791 0.000176 ***
## category_brand_sold -1.145e-04  6.319e-05  -1.813 0.070711 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.389 on 364 degrees of freedom
## Multiple R-squared:  0.9349, Adjusted R-squared:  0.9336 
## F-statistic: 746.6 on 7 and 364 DF,  p-value: < 2.2e-16
AIC(product3_reg)
## [1] 2165.977

We further increased the adjusted R-squared value by removing redundant predictors.

product3_data <- cbind(product3_data, day_type)
names(product3_data)[14] <- "day_type"
product3_reg <- lm(sold_count ~ basket_count + favored_count + 
    category_sold + category_visits + category_basket + category_favored + 
    category_brand_sold + as.factor(day_type), product3_data)
summary(product3_reg)
## 
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold + 
##     category_visits + category_basket + category_favored + category_brand_sold + 
##     as.factor(day_type), data = product3_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.5747  -1.1502   0.0102   0.9884  20.5998 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.756e-01  6.711e-01  -0.411 0.681580    
## basket_count          1.756e-01  6.164e-03  28.492  < 2e-16 ***
## favored_count        -1.442e-02  2.991e-03  -4.821 2.11e-06 ***
## category_sold         2.715e-03  6.118e-04   4.438 1.21e-05 ***
## category_visits       1.266e-05  4.220e-06   2.999 0.002894 ** 
## category_basket       1.417e-05  6.081e-06   2.330 0.020344 *  
## category_favored     -1.921e-04  5.204e-05  -3.692 0.000257 ***
## category_brand_sold  -1.099e-04  6.375e-05  -1.725 0.085452 .  
## as.factor(day_type)2  4.389e-01  8.520e-01   0.515 0.606769    
## as.factor(day_type)3  2.546e-01  8.512e-01   0.299 0.765041    
## as.factor(day_type)4  1.481e+00  8.518e-01   1.739 0.082884 .  
## as.factor(day_type)5  1.479e-01  8.564e-01   0.173 0.862964    
## as.factor(day_type)6  1.182e+00  8.553e-01   1.382 0.167878    
## as.factor(day_type)7  9.857e-01  8.523e-01   1.157 0.248220    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.392 on 358 degrees of freedom
## Multiple R-squared:  0.9359, Adjusted R-squared:  0.9335 
## F-statistic: 401.7 on 13 and 358 DF,  p-value: < 2.2e-16

We added the day type regressor to the model, if the data includes daily seasonality, the model is expected to improve. The daytype regressor had no significant effect on the model.Therefore, we removed daytype regressor from the model.

product3_reg <- lm(sold_count ~ basket_count + favored_count + 
    category_sold + category_visits + category_basket + category_favored + 
    category_brand_sold, product3_data)
summary(product3_reg)
## 
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold + 
##     category_visits + category_basket + category_favored + category_brand_sold, 
##     data = product3_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.1505  -0.7810  -0.1247   0.8607  21.1286 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.745e-01  3.727e-01   1.005 0.315600    
## basket_count         1.751e-01  6.137e-03  28.533  < 2e-16 ***
## favored_count       -1.419e-02  2.973e-03  -4.774 2.62e-06 ***
## category_sold        2.781e-03  6.039e-04   4.605 5.72e-06 ***
## category_visits      1.230e-05  4.195e-06   2.933 0.003568 ** 
## category_basket      1.450e-05  6.045e-06   2.399 0.016959 *  
## category_favored    -1.949e-04  5.141e-05  -3.791 0.000176 ***
## category_brand_sold -1.145e-04  6.319e-05  -1.813 0.070711 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.389 on 364 degrees of freedom
## Multiple R-squared:  0.9349, Adjusted R-squared:  0.9336 
## F-statistic: 746.6 on 7 and 364 DF,  p-value: < 2.2e-16
AIC(product3_reg)
## [1] 2165.977

ARIMA model for product 3

We did decomposition at daily level for product1.Then, we build arima model with auto arima function.

datats3 <- ts(product3_data$sold_count, start = as.Date("2020-05-25"), 
    end = as.Date("2021-05-31"), frequency = 7)
ts_dec_add2 <- decompose(x = datats3, type = "additive")
plot(ts_dec_add2)

model3 = auto.arima(ts_dec_add2$random, max.p = 2, max.q = 2)
AIC(model3)
## [1] 15866.43

Forecasting for product3

We compared the aic values of the arima model and the linear model and chose the linear model.In order to be able to forecast with linear model in the test period, we must also forecast the value of regressors by constructing arima models.

ARIMA models for predictors of Linear Model of Product3

We build arima models by using past data for forecasting predictors or regressors of linear model of product 3.

ARIMA model for Basket_count

we build arima model with decomposing at daily level for basket_count regressor and forecast the value of basket_count on the desired day in competition period.

ggplot(product3_data, aes(x = event_date, y = basket_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

ggplot(product3_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >= 
    "2021-05-01"], aes(x = event_date, y = basket_count)) + geom_line(colour = "firebrick2", 
    size = 1.5) + theme_dark()

basket_count_ts <- ts(product3_data$basket_count, start = as.Date("2021-02-13"), 
    end = as.Date("2021-05-31"), frequency = 7)
basket_count_dec <- decompose(x = basket_count_ts, type = "additive")
basket_count_model = auto.arima(basket_count_dec$random)
AIC(basket_count_model)
## [1] 6917.446
basket_count_model_forecast <- predict(basket_count_model, n.ahead = 16)$pred
seasonality = basket_count_dec$seasonal[1:16]
last_trend_value <- tail(basket_count_dec$trend[!is.na(basket_count_dec$trend)], 
    1)
basket_count_model_forecast = basket_count_model_forecast + last_trend_value + 
    seasonality
basket_count_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 250.1791 263.0404 266.2987 251.2876 248.4188 250.5081 251.5405 252.5490
##  [9] 256.6915 258.2287 250.0061 249.1856 251.0730 251.6152 252.7960 256.0298

ARIMA model for favored_count

ggplot(product3_data, aes(x = event_date, y = favored_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

ggplot(product3_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >= 
    "2021-05-01"], aes(x = event_date, y = favored_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

favored_count_ts <- ts(product3_data$favored_count, start = as.Date("2021-02-13"), 
    end = as.Date("2021-05-31"), frequency = 7)
favored_count_dec <- decompose(x = favored_count_ts, type = "additive")
favored_count_model = auto.arima(favored_count_dec$random)
AIC(favored_count_model)
## [1] 8573.249
favored_count_model_forecast <- predict(favored_count_model, 
    n.ahead = 16)$pred
seasonality = favored_count_dec$seasonal[1:16]
last_trend_value <- tail(favored_count_dec$trend[!is.na(favored_count_dec$trend)], 
    1)
favored_count_model_forecast = favored_count_model_forecast + 
    last_trend_value + seasonality
favored_count_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 407.7535 433.2656 441.7602 416.3097 404.7530 403.8694 404.2885 407.7535
##  [9] 433.2656 441.7602 416.3097 404.7530 403.8694 404.2885 407.7535 433.2656

ARIMA model for category_sold

ggplot(product3_data, aes(x = event_date, y = category_sold)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

category_sold_ts <- ts(product3_data$category_sold, start = as.Date("2021-04-07"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_sold_dec <- decompose(x = category_sold_ts, type = "additive")
category_sold_model = auto.arima(category_sold_dec$random)
AIC(category_sold_model)
## [1] 5207.815
category_sold_model_forecast <- predict(category_sold_model, 
    n.ahead = 16)$pred
seasonality = category_sold_dec$seasonal[1:16]
last_trend_value <- tail(category_sold_dec$trend[!is.na(category_sold_dec$trend)], 
    1)
category_sold_model_forecast = category_sold_model_forecast + 
    last_trend_value + seasonality
category_sold_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 4684.988 4829.658 4458.404 4249.587 4258.094 4427.342 4397.424 3995.785
##  [9] 4200.285 4119.709 4279.729 4276.953 4215.899 4252.005 4399.121 4460.895

ARIMA model for category_visits

ggplot(product3_data, aes(x = event_date, y = category_visits)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

category_visits_ts <- ts(product3_data$category_visits, start = as.Date("2021-04-07"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_visits_dec <- decompose(x = category_visits_ts, type = "additive")
category_visits_model = auto.arima(category_visits_dec$random, 
    max.p = 3, max.q = 3)
AIC(category_visits_model)
## [1] 9171.247
category_visits_model_forecast <- predict(category_visits_model, 
    n.ahead = 16)$pred
seasonality = category_visits_dec$seasonal[1:16]
last_trend_value <- tail(category_visits_dec$trend[!is.na(category_visits_dec$trend)], 
    1)
category_visits_model_forecast = category_visits_model_forecast + 
    last_trend_value + seasonality
category_visits_model_forecast = abs(category_visits_model_forecast)
category_visits_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 1523918.18  940639.85   83084.83  470609.77  162197.03 1143245.98
##  [7] 1415453.68  722371.87  166027.58  335869.24  368998.81 1190433.93
## [13] 1285940.95  549429.64  223524.92  206304.91

ARIMA model for category_basket

ggplot(product3_data, aes(x = event_date, y = category_basket)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

category_basket_ts <- ts(product3_data$category_basket, start = as.Date("2021-02-13"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_basket_dec <- decompose(x = category_basket_ts, type = "additive")
category_basket_model = auto.arima(category_basket_dec$random, 
    max.p = 2, max.q = 2)
AIC(category_basket_model)
## [1] 18077.75
category_basket_model_forecast <- predict(category_basket_model, 
    n.ahead = 16)$pred
seasonality = category_basket_dec$seasonal[1:16]
last_trend_value <- tail(category_basket_dec$trend[!is.na(category_basket_dec$trend)], 
    1)
category_basket_model_forecast = category_basket_model_forecast + 
    last_trend_value + seasonality
category_basket_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1] 495831.4 344890.7 251235.0 241199.6 273443.1 340637.0 391791.4 409243.7
##  [9] 418889.1 409840.0 387829.4 309074.0 279205.0 293899.9 333281.9 375688.2

ARIMA model for Category_favored

fitted = auto.arima(product3_data$category_favored)
b = forecast(fitted, h = 16)
category_favored_model_forecast = b$mean
category_favored_model_forecast
## Time Series:
## Start = 373 
## End = 388 
## Frequency = 1 
##  [1] 9601.316 9511.373 9382.591 9259.483 9188.624 9151.846 9134.385 9128.304
##  [9] 9127.363 9128.146 9129.241 9130.118 9130.676 9130.978 9131.119 9131.170

ARIMA model for Category_brand_sold

ggplot(product3_data, aes(x = event_date, y = category_brand_sold)) + 
    geom_line(colour = "firebrick2", size = 1.5) + theme_dark()

category_brand_sold_ts <- ts(product3_data$category_brand_sold, 
    start = as.Date("2021-01-31"), end = as.Date("2021-05-31"), 
    frequency = 7)
category_brand_sold_dec <- decompose(x = category_brand_sold_ts, 
    type = "additive")
category_brand_sold_model = auto.arima(category_brand_sold_dec$random, 
    max.p = 2, max.q = 2)
AIC(category_brand_sold_model)
## [1] 17175.24
category_brand_sold_model_forecast <- predict(category_brand_sold_model, 
    n.ahead = 16)$pred
seasonality = category_brand_sold_dec$seasonal[1:16]
last_trend_value <- tail(category_brand_sold_dec$trend[!is.na(category_brand_sold_dec$trend)], 
    1)
category_brand_sold_model_forecast = category_brand_sold_model_forecast + 
    last_trend_value + seasonality
category_brand_sold_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18780, 3) 
## Frequency = 7 
##  [1]  84717.26 104296.22  89807.20  69501.46  63849.94  73107.90  82429.50
##  [8]  83365.63  79142.25  73190.98  74166.85  77429.82  79073.77  77369.99
## [15]  75087.70  76381.26

Forecasted value of sales quantity of product 3 in the desired day

We made our predictions in the linear model we built by using the fitted values of the regressors we found in the arima models.

predict(product3_reg, data.frame(basket_count = basket_count_model_forecast[16], 
    favored_count = favored_count_model_forecast[16], category_sold = category_sold_model_forecast[16], 
    category_visits = category_visits_model_forecast[16], category_basket = category_basket_model_forecast[16], 
    category_favored = category_favored_model_forecast[16], category_brand_sold = category_brand_sold_model_forecast[16]))
##       1 
## 48.9182

Including Plots

data_path = "/Users/onurcanaydin/Desktop/360 proje/ProjectRawData.csv"
projectdata <- fread(data_path)

You can also embed plots, for example:

IslakMendil <- projectdata[product_content_id == 4066298]
head(IslakMendil)
##    event_date product_content_id    price sold_count visit_count basket_count
## 1: 2021-05-31            4066298 65.80651       1398       19102         5096
## 2: 2021-05-30            4066298 72.50000        598        6217         1716
## 3: 2021-05-29            4066298 72.51379        528        6251         1782
## 4: 2021-05-28            4066298 72.50000        538        5564         1507
## 5: 2021-05-27            4066298 72.50000        588        5857         1646
## 6: 2021-05-26            4066298 72.50000        589        6089         1771
##    favored_count category_sold category_visits category_basket category_favored
## 1:          1703          6547          108811           28558            11913
## 2:           494          3691           52128           12777             5221
## 3:           509          3604           54600           13328             5178
## 4:           405          1598            3308           48466            12414
## 5:           397          1775            3789           53058            13480
## 6:           409          1948            4017           56226            14497
##    category_brand_sold ty_visits
## 1:                4286 125439876
## 2:                1729 131821083
## 3:                1739 129670029
## 4:                4389 103514886
## 5:                4685 107391579
## 6:                5388 106195988
ts.plot(IslakMendil[, c("sold_count")], main = "Daily Sales Quantity", 
    xlab = "Time", ylab = "Sales Quantity")

In the plot we can see that sales quantity goes costant most of the time but sometimes there are huge increases probably due to the discounts.

We are adding month, day, trend attributes to the data in order to better check and we are taking the logarithm of the sold_count in order to reduce the variance’s effect.

IslakMendil[, `:=`(month, month(event_date))]
IslakMendil[, `:=`(day, lubridate::wday(event_date))]
IslakMendil[, `:=`(trend, 1:.N)]
IslakMendil[, `:=`(log_sold, log(sold_count))]
ts.plot(IslakMendil[, c("log_sold")], main = "Daily Log Sales Quantity", 
    xlab = "Time", ylab = "Sales Quantity Log")

Regression Models

After trying lots of models, the final model is like that with 0.9462 Adjusted R Squared value.

lmIslakMendil <- lm(sold_count ~ basket_count + category_sold + 
    category_visits + category_favored + category_brand_sold, 
    data = IslakMendil)
summary(lmIslakMendil)
## 
## Call:
## lm(formula = sold_count ~ basket_count + category_sold + category_visits + 
##     category_favored + category_brand_sold, data = IslakMendil)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -515.75  -34.94    3.30   41.24  924.82 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.0030329  8.8606618   1.016     0.31    
## basket_count         0.2482176  0.0103450  23.994  < 2e-16 ***
## category_sold        0.1951942  0.0081431  23.970  < 2e-16 ***
## category_visits     -0.0081666  0.0009448  -8.643  < 2e-16 ***
## category_favored    -0.0127085  0.0022058  -5.761 1.77e-08 ***
## category_brand_sold -0.0209877  0.0022743  -9.228  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 97.93 on 366 degrees of freedom
## Multiple R-squared:  0.947,  Adjusted R-squared:  0.9462 
## F-statistic:  1307 on 5 and 366 DF,  p-value: < 2.2e-16
checkresiduals(lmIslakMendil)

## 
##  Breusch-Godfrey test for serial correlation of order up to 10
## 
## data:  Residuals
## LM test = 37.645, df = 10, p-value = 4.375e-05

Residuals have zero mean and around zero variance except extraordinary a few points. Residuals look like normally distributed.

##Decomposing

We took the logarithm of the model and we will go with the additive decomposition.

# time series and arima
tsIslakMendil <- ts(IslakMendil$sold_count, frequency = 7)
ts.plot(tsIslakMendil)

decompIslakMendil <- decompose(tsIslakMendil, type = "additive")

acf(tsIslakMendil)

pacf(tsIslakMendil)

finalARIMAmodel2 <- arima(decompIslakMendil$random, order = c(0, 
    0, 1))
finalARIMAmodel2
## 
## Call:
## arima(x = decompIslakMendil$random, order = c(0, 0, 1))
## 
## Coefficients:
##          ma1  intercept
##       0.2775    -0.3595
## s.e.  0.0471    16.3751
## 
## sigma^2 estimated as 60205:  log likelihood = -2533.38,  aic = 5072.76

By checking the ACF and PACF plots, we decided to create ARIMA(0,0,1) models.

##ARIMA Models of Attributes

ARIMA model for Basket_count

basket_count_ts <- ts(IslakMendil$basket_count, start = as.Date("2021-01-04"), 
    end = as.Date("2021-05-31"), frequency = 7)
basket_count_dec <- decompose(x = basket_count_ts, type = "additive")
basket_count_model = auto.arima(basket_count_dec$random)
AIC(basket_count_model)
## [1] 15816.63
basket_count_model_forecast <- predict(basket_count_model, n.ahead = 1)$pred
seasonality = basket_count_dec$seasonal[1:1]
last_trend_value <- tail(basket_count_dec$trend[!is.na(basket_count_dec$trend)], 
    1)
basket_count_model_forecast = basket_count_model_forecast + last_trend_value + 
    seasonality
basket_count_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 1584.868

ARIMA model for Category_sold

category_sold_ts <- ts(IslakMendil$category_sold, start = as.Date("2021-02-10"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_sold_dec <- decompose(x = category_sold_ts, type = "additive")
category_sold_model = auto.arima(category_sold_dec$random)
AIC(category_sold_model)
## [1] 12217.86
category_sold_model_forecast <- predict(category_sold_model, 
    n.ahead = 1)$pred
seasonality = category_sold_dec$seasonal[1:1]
last_trend_value <- tail(category_sold_dec$trend[!is.na(category_sold_dec$trend)], 
    1)
category_sold_model_forecast = category_sold_model_forecast + 
    last_trend_value + seasonality
category_sold_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 1867.783

ARIMA model for Category_favored

category_favored_ts <- ts(IslakMendil$category_favored, start = as.Date("2021-02-10"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_favored_dec <- decompose(x = category_favored_ts, type = "additive")
category_favored_model = auto.arima(category_favored_dec$random)
AIC(category_favored_model)
## [1] 14350.23
category_favored_model_forecast <- predict(category_favored_model, 
    n.ahead = 1)$pred
seasonality = category_favored_dec$seasonal[1:1]
last_trend_value <- tail(category_favored_dec$trend[!is.na(category_favored_dec$trend)], 
    1)
category_favored_model_forecast = category_favored_model_forecast + 
    last_trend_value + seasonality
category_favored_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 15301.39

ARIMA model for Category_visits

category_visits_ts <- ts(IslakMendil$category_visits, start = as.Date("2021-02-10"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_visits_dec <- decompose(x = category_visits_ts, type = "additive")
category_visits_model = auto.arima(category_visits_dec$random)
AIC(category_visits_model)
## [1] 15081.26
category_visits_model_forecast <- predict(category_visits_model, 
    n.ahead = 1)$pred
seasonality = category_visits_dec$seasonal[1:1]
last_trend_value <- tail(category_visits_dec$trend[!is.na(category_visits_dec$trend)], 
    1)
category_visits_model_forecast = category_visits_model_forecast + 
    last_trend_value + seasonality
category_visits_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 4019.674

ARIMA model for Category_brand_sold

category_brand_sold_ts <- ts(IslakMendil$category_brand_sold, 
    start = as.Date("2021-02-10"), end = as.Date("2021-05-31"), 
    frequency = 7)
category_brand_sold_dec <- decompose(x = category_brand_sold_ts, 
    type = "additive")
category_brand_sold_model = auto.arima(category_brand_sold_dec$random)
AIC(category_brand_sold_model)
## [1] 12442.73
category_brand_sold_model_forecast <- predict(category_brand_sold_model, 
    n.ahead = 1)$pred
seasonality = category_brand_sold_dec$seasonal[1:1]
last_trend_value <- tail(category_brand_sold_dec$trend[!is.na(category_brand_sold_dec$trend)], 
    1)
category_brand_sold_model_forecast = category_brand_sold_model_forecast + 
    last_trend_value + seasonality
category_brand_sold_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 5193.39

We have forecasted the regressors belong to the lmIslakMendil model, now, we will predict the sold_quantity.

Prediction

predict(lmIslakMendil, data.frame(basket_count = basket_count_model_forecast, 
    category_sold = category_sold_model_forecast, category_favored = category_favored_model_forecast, 
    category_visits = category_visits_model_forecast, category_brand_sold = category_brand_sold_model_forecast))
##        1 
## 430.6934

Our prediction for the quantity sold is 430.6934

head(IslakMendil$sold_count, 50)
##  [1] 1398  598  528  538  588  589  568  632  727  606  581  768  611  854  940
## [16]  198  144  128   88  263  204  250  131  121  150  183  226  202  249  427
## [31]  489  613  916  993 1132  688  212  352 1140  726  258  292  408  481  422
## [46]  447  533  366  358  329

##YuzTemizleyici

YuzTemizleyici <- projectdata[product_content_id == 85004]
head(YuzTemizleyici)
##    event_date product_content_id    price sold_count visit_count basket_count
## 1: 2021-05-31              85004 87.39562         80        6002          744
## 2: 2021-05-30              85004 88.62941         85        5265          633
## 3: 2021-05-29              85004 89.57701         67        4925          570
## 4: 2021-05-28              85004 89.22423         52        3774          397
## 5: 2021-05-27              85004 87.25352         71        4189          515
## 6: 2021-05-26              85004 82.22640        125        5641          982
##    favored_count category_sold category_visits category_basket category_favored
## 1:          1642          5048          236197           33681            40472
## 2:          1104          4117          215260           25181            36225
## 3:           913          3872          198431           23790            31700
## 4:           868           309            3057          163110            18821
## 5:           985           411            3589          176215            21144
## 6:           932           971            5107          217245            30724
##    category_brand_sold ty_visits
## 1:                 743 125439876
## 2:                 430 131821083
## 3:                 437 129670029
## 4:               25401 103514886
## 5:               25610 107391579
## 6:               30844 106195988
summary(YuzTemizleyici)
##    event_date         product_content_id     price         sold_count   
##  Min.   :2020-05-25   Min.   :85004      Min.   :64.90   Min.   : 14.0  
##  1st Qu.:2020-08-25   1st Qu.:85004      1st Qu.:74.58   1st Qu.: 33.0  
##  Median :2020-11-26   Median :85004      Median :77.76   Median : 57.0  
##  Mean   :2020-11-26   Mean   :85004      Mean   :77.66   Mean   : 74.2  
##  3rd Qu.:2021-02-27   3rd Qu.:85004      3rd Qu.:80.68   3rd Qu.: 87.5  
##  Max.   :2021-05-31   Max.   :85004      Max.   :89.95   Max.   :447.0  
##   visit_count    basket_count    favored_count    category_sold   
##  Min.   :   0   Min.   :  62.0   Min.   :   0.0   Min.   :  91.0  
##  1st Qu.:   0   1st Qu.: 154.8   1st Qu.:   0.0   1st Qu.: 217.8  
##  Median :   0   Median : 288.0   Median :   0.0   Median : 342.0  
##  Mean   :1405   Mean   : 354.1   Mean   : 401.5   Mean   : 484.4  
##  3rd Qu.:3183   3rd Qu.: 482.0   3rd Qu.: 696.0   3rd Qu.: 547.5  
##  Max.   :8325   Max.   :1338.0   Max.   :2882.0   Max.   :5048.0  
##  category_visits  category_basket  category_favored category_brand_sold
##  Min.   :  1476   Min.   :     0   Min.   : 7465    Min.   :    0      
##  1st Qu.:  2688   1st Qu.:     0   1st Qu.:13236    1st Qu.:    0      
##  Median :  3468   Median :     0   Median :18423    Median :    0      
##  Mean   :  6005   Mean   : 75121   Mean   :22250    Mean   :16556      
##  3rd Qu.:  4610   3rd Qu.:186296   3rd Qu.:26998    3rd Qu.:31558      
##  Max.   :236197   Max.   :467288   Max.   :69429    Max.   :73350      
##    ty_visits        
##  Min.   :        1  
##  1st Qu.:        1  
##  Median :        1  
##  Mean   : 39145174  
##  3rd Qu.: 97619487  
##  Max.   :178545693
ts.plot(YuzTemizleyici[, c("sold_count")], main = "Daily Sales Quantity", 
    xlab = "Time", ylab = "Sales Quantity")

acf(YuzTemizleyici$sold_count, lag.max = 90)

From the Daily Sales Quantity graph, we see some seasonality. Also, there are two separate means which is splitted approximately at Time 200.

Also, we can mention some outliers and we want to avoid from these outliers.

If we check the autocorrelation, we see decreasing trend and some pattern at the level of lag=90, which implies that there is a trend and seasonality in the sales data.

for (i in 1:length(YuzTemizleyici$sold_count)) {

    if (YuzTemizleyici$sold_count[i] >= 200) {
        YuzTemizleyici$sold_count[i] = mean(YuzTemizleyici$sold_count[i - 
            7:i + 7])

    }

}
ts.plot(YuzTemizleyici[, c("sold_count")], main = "Daily Sales Quantity", 
    xlab = "Time", ylab = "Sales Quantity")

acf(YuzTemizleyici$sold_count, lag.max = 90)

We determined a sales quantity level, which is 200 and choose these points as outliers. In order to get rid of outliers, we take the average of closest 15 days to the outlier points and assigned these new value to the outlier point.In this way, we get rid of outliers but autocorrelation function gives higher results.

Regression Model

We add day, month and trend columns to the data.

Since the variance looks high and getting higher, we take the log of the sold_count.

YuzTemizleyici[, `:=`(month, month(event_date))]
YuzTemizleyici[, `:=`(day, lubridate::wday(event_date))]
YuzTemizleyici[, `:=`(trend, 1:.N)]
YuzTemizleyici[, `:=`(log_sold, log(sold_count))]
ts.plot(YuzTemizleyici[, c("log_sold")], main = "Daily Log Sales Quantity", 
    xlab = "Time", ylab = "Sales Quantity Log")

From now on, we will be trying different lm models with different attributes and we want to achieve the best regresssion model.

lmYuzTemizleyici <- lm(sold_count ~ price + visit_count + basket_count + 
    favored_count + category_sold + category_visits + category_basket + 
    category_favored + category_brand_sold + ty_visits + month + 
    day + trend, data = YuzTemizleyici)
summary(lmYuzTemizleyici)
## 
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count + 
##     favored_count + category_sold + category_visits + category_basket + 
##     category_favored + category_brand_sold + ty_visits + month + 
##     day + trend, data = YuzTemizleyici)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -64.094  -8.726  -0.560   7.231  72.050 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.631e+02  2.715e+01   6.006 4.66e-09 ***
## price               -1.947e+00  3.029e-01  -6.427 4.16e-10 ***
## visit_count          1.258e-02  2.603e-03   4.832 2.01e-06 ***
## basket_count         9.236e-02  1.223e-02   7.549 3.67e-13 ***
## favored_count       -1.920e-02  4.570e-03  -4.202 3.35e-05 ***
## category_sold       -2.580e-02  5.677e-03  -4.545 7.53e-06 ***
## category_visits      5.931e-05  1.305e-04   0.455  0.64965    
## category_basket     -4.067e-04  5.909e-05  -6.883 2.64e-11 ***
## category_favored     1.652e-03  1.888e-04   8.746  < 2e-16 ***
## category_brand_sold  7.575e-04  1.507e-04   5.026 7.92e-07 ***
## ty_visits            3.000e-07  1.024e-07   2.930  0.00361 ** 
## month               -8.123e-01  4.014e-01  -2.024  0.04374 *  
## day                 -5.860e-01  4.594e-01  -1.276  0.20292    
## trend               -8.099e-04  2.097e-02  -0.039  0.96922    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.38 on 358 degrees of freedom
## Multiple R-squared:  0.8128, Adjusted R-squared:  0.806 
## F-statistic: 119.5 on 13 and 358 DF,  p-value: < 2.2e-16
lmYuzTemizleyici <- lm(sold_count ~ price + visit_count + basket_count + 
    favored_count + category_sold + category_basket + category_favored + 
    category_brand_sold + ty_visits + month + trend, data = YuzTemizleyici)
summary(lmYuzTemizleyici)
## 
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count + 
##     favored_count + category_sold + category_basket + category_favored + 
##     category_brand_sold + ty_visits + month + trend, data = YuzTemizleyici)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -65.787  -8.952  -0.527   6.851  73.381 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.543e+02  2.637e+01   5.853 1.09e-08 ***
## price               -1.879e+00  2.986e-01  -6.293 9.06e-10 ***
## visit_count          1.290e-02  2.368e-03   5.449 9.43e-08 ***
## basket_count         9.007e-02  1.019e-02   8.835  < 2e-16 ***
## favored_count       -1.906e-02  4.564e-03  -4.175 3.74e-05 ***
## category_sold       -2.341e-02  3.326e-03  -7.039 9.83e-12 ***
## category_basket     -4.179e-04  5.236e-05  -7.982 1.95e-14 ***
## category_favored     1.645e-03  1.884e-04   8.730  < 2e-16 ***
## category_brand_sold  7.810e-04  1.493e-04   5.230 2.88e-07 ***
## ty_visits            3.107e-07  1.003e-07   3.098   0.0021 ** 
## month               -7.796e-01  3.981e-01  -1.958   0.0510 .  
## trend                2.807e-03  2.080e-02   0.135   0.8927    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.38 on 360 degrees of freedom
## Multiple R-squared:  0.8118, Adjusted R-squared:  0.806 
## F-statistic: 141.2 on 11 and 360 DF,  p-value: < 2.2e-16
lmYuzTemizleyici <- lm(sold_count ~ price + visit_count + basket_count + 
    favored_count + category_sold + category_basket + category_favored + 
    category_brand_sold + ty_visits + month + trend + as.factor(day), 
    data = YuzTemizleyici)
summary(lmYuzTemizleyici)
## 
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count + 
##     favored_count + category_sold + category_basket + category_favored + 
##     category_brand_sold + ty_visits + month + trend + as.factor(day), 
##     data = YuzTemizleyici)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.705  -8.185  -1.266   7.948  70.298 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.639e+02  2.674e+01   6.130 2.34e-09 ***
## price               -2.047e+00  3.000e-01  -6.823 3.86e-11 ***
## visit_count          1.326e-02  2.356e-03   5.626 3.76e-08 ***
## basket_count         8.573e-02  1.015e-02   8.449 7.74e-16 ***
## favored_count       -1.783e-02  4.522e-03  -3.942 9.73e-05 ***
## category_sold       -2.477e-02  3.311e-03  -7.482 5.85e-13 ***
## category_basket     -4.480e-04  5.240e-05  -8.549 3.77e-16 ***
## category_favored     1.670e-03  1.863e-04   8.964  < 2e-16 ***
## category_brand_sold  7.957e-04  1.482e-04   5.370 1.43e-07 ***
## ty_visits            3.729e-07  1.008e-07   3.701 0.000249 ***
## month               -7.743e-01  3.956e-01  -1.957 0.051088 .  
## trend                5.458e-03  2.081e-02   0.262 0.793241    
## as.factor(day)2      6.161e+00  3.357e+00   1.835 0.067328 .  
## as.factor(day)3      4.774e+00  3.393e+00   1.407 0.160257    
## as.factor(day)4      4.780e+00  3.412e+00   1.401 0.162011    
## as.factor(day)5      8.350e+00  3.396e+00   2.458 0.014431 *  
## as.factor(day)6      9.185e-01  3.399e+00   0.270 0.787122    
## as.factor(day)7     -3.305e+00  3.354e+00  -0.985 0.325145    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.13 on 354 degrees of freedom
## Multiple R-squared:  0.8202, Adjusted R-squared:  0.8115 
## F-statistic: 94.97 on 17 and 354 DF,  p-value: < 2.2e-16
lmYuzTemizleyici <- lm(sold_count ~ price + visit_count + basket_count + 
    favored_count + category_sold + category_basket + category_favored + 
    category_brand_sold + ty_visits + as.factor(month) + as.factor(day), 
    data = YuzTemizleyici)
summary(lmYuzTemizleyici)
## 
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count + 
##     favored_count + category_sold + category_basket + category_favored + 
##     category_brand_sold + ty_visits + as.factor(month) + as.factor(day), 
##     data = YuzTemizleyici)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.245  -9.025  -0.907   7.586  74.700 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.906e+02  2.819e+01   6.763 5.80e-11 ***
## price               -2.229e+00  3.273e-01  -6.809 4.38e-11 ***
## visit_count          1.209e-02  2.668e-03   4.530 8.13e-06 ***
## basket_count         7.837e-02  1.174e-02   6.673 9.99e-11 ***
## favored_count       -1.618e-02  4.710e-03  -3.435 0.000664 ***
## category_sold       -2.383e-02  3.487e-03  -6.834 3.74e-11 ***
## category_basket     -3.549e-04  6.846e-05  -5.184 3.70e-07 ***
## category_favored     1.739e-03  1.933e-04   8.995  < 2e-16 ***
## category_brand_sold  4.423e-04  2.453e-04   1.803 0.072219 .  
## ty_visits            3.725e-07  9.860e-08   3.778 0.000186 ***
## as.factor(month)2   -1.843e+01  7.919e+00  -2.327 0.020564 *  
## as.factor(month)3   -1.525e+01  7.825e+00  -1.949 0.052144 .  
## as.factor(month)4   -2.222e+01  8.397e+00  -2.646 0.008508 ** 
## as.factor(month)5   -1.508e+01  8.285e+00  -1.821 0.069534 .  
## as.factor(month)6   -1.015e+01  8.012e+00  -1.267 0.206124    
## as.factor(month)7   -1.988e+01  8.140e+00  -2.443 0.015077 *  
## as.factor(month)8   -1.959e+01  8.303e+00  -2.360 0.018849 *  
## as.factor(month)9   -1.744e+01  8.239e+00  -2.117 0.034979 *  
## as.factor(month)10  -1.589e+01  8.027e+00  -1.979 0.048561 *  
## as.factor(month)11  -2.218e+01  8.196e+00  -2.706 0.007147 ** 
## as.factor(month)12  -9.922e+00  4.930e+00  -2.013 0.044937 *  
## as.factor(day)2      5.948e+00  3.345e+00   1.778 0.076268 .  
## as.factor(day)3      3.933e+00  3.415e+00   1.152 0.250177    
## as.factor(day)4      3.736e+00  3.442e+00   1.085 0.278545    
## as.factor(day)5      7.402e+00  3.434e+00   2.155 0.031820 *  
## as.factor(day)6      5.805e-02  3.423e+00   0.017 0.986480    
## as.factor(day)7     -4.110e+00  3.362e+00  -1.223 0.222279    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.03 on 345 degrees of freedom
## Multiple R-squared:  0.8268, Adjusted R-squared:  0.8138 
## F-statistic: 63.36 on 26 and 345 DF,  p-value: < 2.2e-16
lmYuzTemizleyici <- lm(sold_count ~ price + visit_count + basket_count + 
    favored_count + category_sold + category_basket + category_favored + 
    category_brand_sold + ty_visits + as.factor(month) + trend + 
    as.factor(day), data = YuzTemizleyici)
summary(lmYuzTemizleyici)
## 
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count + 
##     favored_count + category_sold + category_basket + category_favored + 
##     category_brand_sold + ty_visits + as.factor(month) + trend + 
##     as.factor(day), data = YuzTemizleyici)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.107  -9.139  -0.956   7.566  74.958 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.946e+02  3.062e+01   6.355 6.60e-10 ***
## price               -2.247e+00  3.323e-01  -6.761 5.87e-11 ***
## visit_count          1.186e-02  2.754e-03   4.307 2.16e-05 ***
## basket_count         7.798e-02  1.182e-02   6.600 1.56e-10 ***
## favored_count       -1.608e-02  4.726e-03  -3.403 0.000746 ***
## category_sold       -2.378e-02  3.495e-03  -6.804 4.53e-11 ***
## category_basket     -3.458e-04  7.377e-05  -4.688 3.98e-06 ***
## category_favored     1.748e-03  1.953e-04   8.949  < 2e-16 ***
## category_brand_sold  4.162e-04  2.577e-04   1.615 0.107202    
## ty_visits            3.518e-07  1.165e-07   3.019 0.002724 ** 
## as.factor(month)2   -1.765e+01  8.258e+00  -2.138 0.033243 *  
## as.factor(month)3   -1.495e+01  7.885e+00  -1.897 0.058726 .  
## as.factor(month)4   -2.198e+01  8.438e+00  -2.605 0.009585 ** 
## as.factor(month)5   -1.414e+01  8.765e+00  -1.613 0.107681    
## as.factor(month)6   -8.480e+00  9.448e+00  -0.898 0.370015    
## as.factor(month)7   -1.862e+01  8.991e+00  -2.070 0.039160 *  
## as.factor(month)8   -1.874e+01  8.701e+00  -2.153 0.031991 *  
## as.factor(month)9   -1.693e+01  8.388e+00  -2.019 0.044261 *  
## as.factor(month)10  -1.569e+01  8.059e+00  -1.946 0.052410 .  
## as.factor(month)11  -2.240e+01  8.233e+00  -2.721 0.006845 ** 
## as.factor(month)12  -9.779e+00  4.955e+00  -1.974 0.049232 *  
## trend               -1.186e-02  3.549e-02  -0.334 0.738440    
## as.factor(day)2      5.813e+00  3.374e+00   1.723 0.085782 .  
## as.factor(day)3      3.795e+00  3.444e+00   1.102 0.271298    
## as.factor(day)4      3.540e+00  3.496e+00   1.013 0.311882    
## as.factor(day)5      7.201e+00  3.490e+00   2.063 0.039856 *  
## as.factor(day)6     -1.457e-01  3.481e+00  -0.042 0.966629    
## as.factor(day)7     -4.175e+00  3.372e+00  -1.238 0.216436    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.05 on 344 degrees of freedom
## Multiple R-squared:  0.8269, Adjusted R-squared:  0.8133 
## F-statistic: 60.86 on 27 and 344 DF,  p-value: < 2.2e-16

Up to now, we’ve achieved 0.8159 Adjusted R-Squared value.

We’re adding the Residual attribute:

YuzTemizleyici <- YuzTemizleyici[, `:=`(Residual, 0)]
YuzTemizleyici$Residual[1] = NA
YuzTemizleyici$Residual[2:372] <- residuals(lmYuzTemizleyici)[1:371]
lmYuzTemizleyici <- lm(log_sold ~ price + visit_count + basket_count + 
    favored_count + category_sold + category_basket + category_favored + 
    category_brand_sold + ty_visits + month + trend + as.factor(day) + 
    Residual, data = YuzTemizleyici)

summary(lmYuzTemizleyici)
## 
## Call:
## lm(formula = log_sold ~ price + visit_count + basket_count + 
##     favored_count + category_sold + category_basket + category_favored + 
##     category_brand_sold + ty_visits + month + trend + as.factor(day) + 
##     Residual, data = YuzTemizleyici)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.82704 -0.15155  0.00492  0.17851  0.80531 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6.294e+00  4.125e-01  15.257  < 2e-16 ***
## price               -3.598e-02  4.627e-03  -7.778 8.25e-14 ***
## visit_count          1.597e-04  3.670e-05   4.351 1.77e-05 ***
## basket_count         1.061e-03  1.608e-04   6.599 1.52e-10 ***
## favored_count       -2.680e-04  7.007e-05  -3.824 0.000155 ***
## category_sold       -3.528e-04  5.636e-05  -6.260 1.12e-09 ***
## category_basket     -7.311e-06  8.086e-07  -9.041  < 2e-16 ***
## category_favored     2.581e-05  2.877e-06   8.972  < 2e-16 ***
## category_brand_sold  1.174e-05  2.286e-06   5.137 4.63e-07 ***
## ty_visits            7.016e-09  1.556e-09   4.509 8.89e-06 ***
## month               -3.368e-02  6.101e-03  -5.520 6.60e-08 ***
## trend               -7.851e-04  3.211e-04  -2.445 0.014980 *  
## as.factor(day)2      1.027e-01  5.198e-02   1.976 0.048897 *  
## as.factor(day)3      8.666e-02  5.232e-02   1.656 0.098567 .  
## as.factor(day)4      7.739e-02  5.259e-02   1.472 0.142027    
## as.factor(day)5      1.311e-01  5.236e-02   2.503 0.012753 *  
## as.factor(day)6      1.623e-02  5.241e-02   0.310 0.756969    
## as.factor(day)7     -4.111e-02  5.170e-02  -0.795 0.427092    
## Residual             3.120e-03  8.391e-04   3.719 0.000233 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.264 on 352 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.8121, Adjusted R-squared:  0.8025 
## F-statistic: 84.53 on 18 and 352 DF,  p-value: < 2.2e-16

We’re adding the lag_1 attribute:

YuzTemizleyici[, `:=`(lag_1, NA)]
YuzTemizleyici$lag_1[2:372] <- YuzTemizleyici$sold_count[1:371]

lmYuzTemizleyici <- lm(sold_count ~ price + visit_count + basket_count + 
    favored_count + category_sold + category_basket + category_favored + 
    category_brand_sold + ty_visits + as.factor(month) + trend + 
    as.factor(day) + Residual + lag_1, data = YuzTemizleyici)
summary(lmYuzTemizleyici)
## 
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count + 
##     favored_count + category_sold + category_basket + category_favored + 
##     category_brand_sold + ty_visits + as.factor(month) + trend + 
##     as.factor(day) + Residual + lag_1, data = YuzTemizleyici)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.073  -8.805  -0.641   7.333  75.074 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.189e+02  3.258e+01   6.718 7.72e-11 ***
## price               -2.495e+00  3.524e-01  -7.082 8.16e-12 ***
## visit_count          1.300e-02  2.766e-03   4.701 3.77e-06 ***
## basket_count         8.190e-02  1.191e-02   6.879 2.89e-11 ***
## favored_count       -1.681e-02  4.686e-03  -3.588 0.000382 ***
## category_sold       -2.576e-02  3.845e-03  -6.701 8.58e-11 ***
## category_basket     -3.742e-04  7.398e-05  -5.059 6.92e-07 ***
## category_favored     1.883e-03  2.068e-04   9.103  < 2e-16 ***
## category_brand_sold  4.297e-04  2.547e-04   1.687 0.092460 .  
## ty_visits            3.626e-07  1.172e-07   3.094 0.002138 ** 
## as.factor(month)2   -1.724e+01  8.230e+00  -2.094 0.036968 *  
## as.factor(month)3   -1.493e+01  7.903e+00  -1.889 0.059793 .  
## as.factor(month)4   -2.166e+01  8.438e+00  -2.567 0.010692 *  
## as.factor(month)5   -1.383e+01  8.654e+00  -1.598 0.110888    
## as.factor(month)6   -8.408e+00  9.316e+00  -0.903 0.367407    
## as.factor(month)7   -1.969e+01  8.936e+00  -2.204 0.028203 *  
## as.factor(month)8   -2.013e+01  8.647e+00  -2.328 0.020509 *  
## as.factor(month)9   -1.829e+01  8.318e+00  -2.199 0.028548 *  
## as.factor(month)10  -1.729e+01  7.996e+00  -2.162 0.031286 *  
## as.factor(month)11  -2.415e+01  8.159e+00  -2.959 0.003297 ** 
## as.factor(month)12  -1.072e+01  4.928e+00  -2.175 0.030330 *  
## trend               -2.156e-02  3.502e-02  -0.616 0.538468    
## as.factor(day)2      6.390e+00  3.368e+00   1.897 0.058626 .  
## as.factor(day)3      3.865e+00  3.392e+00   1.139 0.255316    
## as.factor(day)4      3.539e+00  3.440e+00   1.029 0.304385    
## as.factor(day)5      6.490e+00  3.446e+00   1.883 0.060527 .  
## as.factor(day)6     -1.152e+00  3.452e+00  -0.334 0.738905    
## as.factor(day)7     -4.801e+00  3.332e+00  -1.441 0.150577    
## Residual             2.675e-01  7.360e-02   3.634 0.000322 ***
## lag_1               -8.671e-02  5.044e-02  -1.719 0.086520 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.77 on 341 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.8338, Adjusted R-squared:  0.8197 
## F-statistic: 59.01 on 29 and 341 DF,  p-value: < 2.2e-16
AIC(lmYuzTemizleyici)
## [1] 3175.888
# this is the best model I can achieve.

With the 0.8224 Adjusted R-squared value, this is the best model we achieved.

checkresiduals(lmYuzTemizleyici)

## 
##  Breusch-Godfrey test for serial correlation of order up to 33
## 
## data:  Residuals
## LM test = 33.684, df = 33, p-value = 0.4342

Residuals seem have 0 mean, although the variance increases at some points. There aren’t any significant autocorrelation and residuals looks like normally distributed.

##Decomposing the Data

# time series and arima
tsYuzTemizleyici <- ts(YuzTemizleyici$log_sold, frequency = 7)
ts.plot(tsYuzTemizleyici)

finaldecompYuzTemizleyici <- decompose(tsYuzTemizleyici, type = "additive")
plot(finaldecompYuzTemizleyici)

auto.arima(finaldecompYuzTemizleyici$random, seasonal = FALSE, 
    trace = TRUE)
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,0,2)           with non-zero mean : 29.53309
##  ARIMA(0,0,0)           with non-zero mean : 187.0031
##  ARIMA(1,0,0)           with non-zero mean : 173.1295
##  ARIMA(0,0,1)           with non-zero mean : 165.6274
##  ARIMA(0,0,0)           with zero mean     : 184.9823
##  ARIMA(1,0,2)           with non-zero mean : 73.04316
##  ARIMA(2,0,1)           with non-zero mean : 36.10984
##  ARIMA(3,0,2)           with non-zero mean : Inf
##  ARIMA(2,0,3)           with non-zero mean : 30.68142
##  ARIMA(1,0,1)           with non-zero mean : 167.7132
##  ARIMA(1,0,3)           with non-zero mean : Inf
##  ARIMA(3,0,1)           with non-zero mean : 28.69084
##  ARIMA(3,0,0)           with non-zero mean : 117.2414
##  ARIMA(4,0,1)           with non-zero mean : 54.01181
##  ARIMA(2,0,0)           with non-zero mean : 146.9046
##  ARIMA(4,0,0)           with non-zero mean : 108.0943
##  ARIMA(4,0,2)           with non-zero mean : 32.36692
##  ARIMA(3,0,1)           with zero mean     : 27.27624
##  ARIMA(2,0,1)           with zero mean     : 34.5864
##  ARIMA(3,0,0)           with zero mean     : 115.1856
##  ARIMA(4,0,1)           with zero mean     : 52.33744
##  ARIMA(3,0,2)           with zero mean     : Inf
##  ARIMA(2,0,0)           with zero mean     : 144.8623
##  ARIMA(2,0,2)           with zero mean     : 27.9751
##  ARIMA(4,0,0)           with zero mean     : 106.0316
##  ARIMA(4,0,2)           with zero mean     : 30.83249
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(3,0,1)           with zero mean     : Inf
##  ARIMA(2,0,2)           with zero mean     : Inf
##  ARIMA(3,0,1)           with non-zero mean : Inf
##  ARIMA(2,0,2)           with non-zero mean : Inf
##  ARIMA(2,0,3)           with non-zero mean : Inf
##  ARIMA(4,0,2)           with zero mean     : Inf
##  ARIMA(4,0,2)           with non-zero mean : Inf
##  ARIMA(2,0,1)           with zero mean     : Inf
##  ARIMA(2,0,1)           with non-zero mean : Inf
##  ARIMA(4,0,1)           with zero mean     : Inf
##  ARIMA(4,0,1)           with non-zero mean : Inf
##  ARIMA(1,0,2)           with non-zero mean : Inf
##  ARIMA(4,0,0)           with zero mean     : 106.9794
## 
##  Best model: ARIMA(4,0,0)           with zero mean
## Series: finaldecompYuzTemizleyici$random 
## ARIMA(4,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ar3      ar4
##       0.1337  -0.2320  -0.2587  -0.1645
## s.e.  0.0517   0.0503   0.0503   0.0517
## 
## sigma^2 estimated as 0.077:  log likelihood=-48.41
## AIC=106.81   AICc=106.98   BIC=126.33
YTARIMAmodel2 <- arima(finaldecompYuzTemizleyici$random, order = c(3, 
    0, 1))
YTARIMAmodel2
## 
## Call:
## arima(x = finaldecompYuzTemizleyici$random, order = c(3, 0, 1))
## 
## Coefficients:
##          ar1      ar2      ar3      ma1  intercept
##       0.8616  -0.3719  -0.1223  -1.0000      0e+00
## s.e.  0.0519   0.0659   0.0519   0.0069      2e-04
## 
## sigma^2 estimated as 0.05866:  log likelihood = -3.38,  aic = 18.76
AIC(YTARIMAmodel2)
## [1] 18.76292

##ARIMA Models for Attributes

We’ve created ARIMA models for the attributes in the our final regression model. In order to predict sold quantity, we have to predict attributes first.

ARIMA model for Price

price_ts <- ts(YuzTemizleyici$price, start = as.Date("2021-05-04"), 
    end = as.Date("2021-05-31"), frequency = 7)
price_dec <- decompose(x = price_ts, type = "additive")
price_model = auto.arima(price_dec$random)
AIC(price_model)
## [1] 767.0668
price_model_forecast <- predict(price_model, n.ahead = 1)$pred
seasonality = price_dec$seasonal[1:1]
last_trend_value <- tail(price_dec$trend[!is.na(price_dec$trend)], 
    1)
price_model_forecast = price_model_forecast + last_trend_value + 
    seasonality
price_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 66.95522

ARIMA model for visit_count

visit_count_ts <- ts(YuzTemizleyici$visit_count, start = as.Date("2021-01-28"), 
    end = as.Date("2021-05-31"), frequency = 7)
visit_count_dec <- decompose(x = visit_count_ts, type = "additive")
visit_count_model = auto.arima(visit_count_dec$random)
AIC(visit_count_model)
## [1] 12932.84
visit_count_model_forecast <- predict(visit_count_model, n.ahead = 1)$pred
seasonality = visit_count_dec$seasonal[1:1]
last_trend_value <- tail(visit_count_dec$trend[!is.na(visit_count_dec$trend)], 
    1)
visit_count_model_forecast = visit_count_model_forecast + last_trend_value + 
    seasonality
visit_count_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 4286.376

ARIMA model for basket_count

basket_count_ts <- ts(YuzTemizleyici$basket_count, start = as.Date("2021-04-28"), 
    end = as.Date("2021-05-31"), frequency = 7)
basket_count_dec <- decompose(x = basket_count_ts, type = "additive")
basket_count_model = auto.arima(basket_count_dec$random)
AIC(basket_count_model)
## [1] 2747.135
basket_count_model_forecast <- predict(basket_count_model, n.ahead = 1)$pred
seasonality = basket_count_dec$seasonal[1:1]
last_trend_value <- tail(basket_count_dec$trend[!is.na(basket_count_dec$trend)], 
    1)
basket_count_model_forecast = basket_count_model_forecast + last_trend_value + 
    seasonality
basket_count_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 178.9897

ARIMA model for favored_count

favored_count_ts <- ts(YuzTemizleyici$favored_count, start = as.Date("2021-01-28"), 
    end = as.Date("2021-05-31"), frequency = 7)
favored_count_dec <- decompose(x = favored_count_ts, type = "additive")
favored_count_model = auto.arima(favored_count_dec$random)
AIC(favored_count_model)
## [1] 11869.29
favored_count_model_forecast <- predict(favored_count_model, 
    n.ahead = 1)$pred
seasonality = favored_count_dec$seasonal[1:1]
last_trend_value <- tail(favored_count_dec$trend[!is.na(favored_count_dec$trend)], 
    1)
favored_count_model_forecast = favored_count_model_forecast + 
    last_trend_value + seasonality
favored_count_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 719.3471

ARIMA model for category_sold

category_sold_ts <- ts(YuzTemizleyici$category_sold, start = as.Date("2021-04-28"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_sold_dec <- decompose(x = category_sold_ts, type = "additive")
category_sold_model = auto.arima(category_sold_dec$random)
AIC(category_sold_model)
## [1] 3125.509
category_sold_model_forecast <- predict(category_sold_model, 
    n.ahead = 1)$pred
seasonality = category_sold_dec$seasonal[1:1]
last_trend_value <- tail(category_sold_dec$trend[!is.na(category_sold_dec$trend)], 
    1)
category_sold_model_forecast = category_sold_model_forecast + 
    last_trend_value + seasonality
category_sold_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 324.108

ARIMA model for category_basket

category_basket_ts <- ts(YuzTemizleyici$category_basket, start = as.Date("2021-01-28"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_basket_dec <- decompose(x = category_basket_ts, type = "additive")
category_basket_model = auto.arima(category_basket_dec$random)
AIC(category_basket_model)
## [1] 19434.99
category_basket_model_forecast <- predict(category_basket_model, 
    n.ahead = 1)$pred
seasonality = category_basket_dec$seasonal[1:1]
last_trend_value <- tail(category_basket_dec$trend[!is.na(category_basket_dec$trend)], 
    1)
category_basket_model_forecast = category_basket_model_forecast + 
    last_trend_value + seasonality
category_basket_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 262743.1

ARIMA model for category_favored

category_favored_ts <- ts(YuzTemizleyici$category_favored, start = as.Date("2021-04-28"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_favored_dec <- decompose(x = category_favored_ts, type = "additive")
category_favored_model = auto.arima(category_favored_dec$random)
AIC(category_favored_model)
## [1] 4432.176
category_favored_model_forecast <- predict(category_favored_model, 
    n.ahead = 1)$pred
seasonality = category_favored_dec$seasonal[1:1]
last_trend_value <- tail(category_favored_dec$trend[!is.na(category_favored_dec$trend)], 
    1)
category_favored_model_forecast = category_favored_model_forecast + 
    last_trend_value + seasonality
category_favored_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 13984.14

ARIMA model for category_brand_sold

category_brand_sold_ts <- ts(YuzTemizleyici$category_brand_sold, 
    start = as.Date("2021-01-28"), end = as.Date("2021-05-31"), 
    frequency = 7)
category_brand_sold_dec <- decompose(x = category_brand_sold_ts, 
    type = "additive")
category_brand_sold_model = auto.arima(category_brand_sold_dec$random)
AIC(category_brand_sold_model)
## [1] 16680.66
category_brand_sold_model_forecast <- predict(category_brand_sold_model, 
    n.ahead = 1)$pred
seasonality = category_brand_sold_dec$seasonal[1:1]
last_trend_value <- tail(category_brand_sold_dec$trend[!is.na(category_brand_sold_dec$trend)], 
    1)
category_brand_sold_model_forecast = category_brand_sold_model_forecast + 
    last_trend_value + seasonality
category_brand_sold_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 38168.86

ARIMA model for ty_visits

ty_visits_ts <- ts(YuzTemizleyici$ty_visits, start = as.Date("2021-01-28"), 
    end = as.Date("2021-05-31"), frequency = 7)
ty_visits_dec <- decompose(x = ty_visits_ts, type = "additive")
ty_visits_model = auto.arima(ty_visits_dec$random)
AIC(ty_visits_model)
## [1] 29881.78
ty_visits_model_forecast <- predict(ty_visits_model, n.ahead = 1)$pred
seasonality = ty_visits_dec$seasonal[1:1]
last_trend_value <- tail(ty_visits_dec$trend[!is.na(ty_visits_dec$trend)], 
    1)
ty_visits_model_forecast = ty_visits_model_forecast + last_trend_value + 
    seasonality
ty_visits_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 116744719

ARIMA model for Residuals

Residual_ts <- ts(YuzTemizleyici$Residual, start = as.Date("2021-04-28"), 
    end = as.Date("2021-05-31"), frequency = 7)
Residual_dec <- decompose(x = Residual_ts, type = "additive")
Residual_model = auto.arima(Residual_dec$random)
AIC(Residual_model)
## [1] 1908.616
Residual_model_forecast <- predict(Residual_model, n.ahead = 1)$pred
seasonality = Residual_dec$seasonal[1:1]
last_trend_value <- tail(Residual_dec$trend[!is.na(Residual_dec$trend)], 
    1)
Residual_model_forecast = Residual_model_forecast + last_trend_value + 
    seasonality
Residual_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 3.991736

ARIMA model for lag_1

lag_1_ts <- ts(YuzTemizleyici$lag_1, start = as.Date("2021-04-28"), 
    end = as.Date("2021-05-31"), frequency = 7)
lag_1_dec <- decompose(x = lag_1_ts, type = "additive")
lag_1_model = auto.arima(lag_1_dec$random)
AIC(lag_1_model)
## [1] 2053.699
lag_1_model_forecast <- predict(lag_1_model, n.ahead = 1)$pred
seasonality = lag_1_dec$seasonal[1:1]
last_trend_value <- tail(lag_1_dec$trend[!is.na(lag_1_dec$trend)], 
    1)
lag_1_model_forecast = lag_1_model_forecast + last_trend_value + 
    seasonality
lag_1_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 48.06433

We’ve forecasted the attributes until now.

##Prediction

predict(lmYuzTemizleyici, data.frame(price = price_model_forecast, 
    visit_count = visit_count_model_forecast, basket_count = basket_count_model_forecast, 
    favored_count = favored_count_model_forecast, category_sold = category_sold_model_forecast, 
    category_basket = category_basket_model_forecast, category_favored = category_favored_model_forecast, 
    category_brand_sold = category_brand_sold_model_forecast, 
    ty_visits = ty_visits_model_forecast, month = as.factor(6), 
    trend = 0, day = as.factor(3), Residual = Residual_model_forecast, 
    lag_1 = lag_1_model_forecast))
##        1 
## 80.82597

Our prediction for YuzTemizleyici sold quantity is 80.90704

Including Plots

You can also embed plots, for example:

DisFircasi <- projectdata[product_content_id == 32939029]
(DisFircasi)
##      event_date product_content_id    price sold_count visit_count basket_count
##   1: 2021-05-31           32939029 141.2127        125        5301          608
##   2: 2021-05-30           32939029 141.8578        139        5049          604
##   3: 2021-05-29           32939029 142.1202        133        5383          615
##   4: 2021-05-28           32939029 137.6755        182        5853          739
##   5: 2021-05-27           32939029 136.0229        235        6720          911
##  ---                                                                           
## 368: 2020-05-29           32939029 126.1038         52           0          257
## 369: 2020-05-28           32939029 115.1035         84           0          370
## 370: 2020-05-27           32939029 114.1078        103           0          398
## 371: 2020-05-26           32939029 115.8495        101           0          411
## 372: 2020-05-25           32939029 112.9000         74           0          323
##      favored_count category_sold category_visits category_basket
##   1:           698           930           41973            3911
##   2:           609           911           43021            3801
##   3:           624           895           40310            3611
##   4:           608           807             861           34608
##   5:           869           800             853           36514
##  ---                                                            
## 368:             0           810             851               0
## 369:             0           927             978               0
## 370:             0          1071            1125               0
## 371:             0          1351            1419               0
## 372:             0          1193            1231               0
##      category_favored category_brand_sold ty_visits
##   1:             5791                 875 125439876
##   2:             4602                 866 131821083
##   3:             3946                 857 129670029
##   4:             3214                3370 103514886
##   5:             3377                3946 107391579
##  ---                                               
## 368:             2878                   0         1
## 369:             3336                   0         1
## 370:             3876                   0         1
## 371:             4647                   0         1
## 372:             4132                   0         1
summary(DisFircasi)
##    event_date         product_content_id     price         sold_count    
##  Min.   :2020-05-25   Min.   :32939029   Min.   :110.1   Min.   :  0.00  
##  1st Qu.:2020-08-25   1st Qu.:32939029   1st Qu.:128.9   1st Qu.: 18.00  
##  Median :2020-11-26   Median :32939029   Median :135.4   Median : 52.00  
##  Mean   :2020-11-26   Mean   :32939029   Mean   :134.3   Mean   : 92.21  
##  3rd Qu.:2021-02-27   3rd Qu.:32939029   3rd Qu.:140.0   3rd Qu.:139.50  
##  Max.   :2021-05-31   Max.   :32939029   Max.   :165.9   Max.   :513.00  
##                                          NA's   :9                       
##   visit_count     basket_count     favored_count    category_sold 
##  Min.   :    0   Min.   :   0.00   Min.   :   0.0   Min.   : 321  
##  1st Qu.:    0   1st Qu.:  83.75   1st Qu.:   0.0   1st Qu.: 598  
##  Median :    0   Median : 216.50   Median :   0.0   Median : 806  
##  Mean   : 2119   Mean   : 392.81   Mean   : 343.9   Mean   :1024  
##  3rd Qu.: 4097   3rd Qu.: 578.00   3rd Qu.: 572.2   3rd Qu.:1116  
##  Max.   :15725   Max.   :2249.00   Max.   :2696.0   Max.   :5557  
##                                                                   
##  category_visits   category_basket  category_favored category_brand_sold
##  Min.   :  346.0   Min.   :     0   Min.   : 1242    Min.   :    0      
##  1st Qu.:  640.0   1st Qu.:     0   1st Qu.: 2426    1st Qu.:    0      
##  Median :  854.5   Median :     0   Median : 3300    Median :    0      
##  Mean   : 1406.9   Mean   : 19977   Mean   : 4266    Mean   : 3187      
##  3rd Qu.: 1185.0   3rd Qu.: 44483   3rd Qu.: 5050    3rd Qu.: 5428      
##  Max.   :43021.0   Max.   :281022   Max.   :44445    Max.   :28944      
##                                                                         
##    ty_visits        
##  Min.   :        1  
##  1st Qu.:        1  
##  Median :        1  
##  Mean   : 39145174  
##  3rd Qu.: 97619487  
##  Max.   :178545693  
## 
ts.plot(DisFircasi[, c("sold_count")], main = "Daily Sales Quantity", 
    xlab = "Time", ylab = "Sales Quantity")

In the plot we can the increasing trend in the sales quantities. Since there is a big shift in the number of sales, we will only use the values between 1:150

DisFircasi <- DisFircasi[1:150]
ts.plot(DisFircasi[, c("sold_count")], main = "Daily Sales Quantity", 
    xlab = "Time", ylab = "Sales Quantity")

We are adding month, day, trend attributes to the data in order to better check.

DisFircasi[, `:=`(month, month(event_date))]
DisFircasi[, `:=`(day, lubridate::wday(event_date))]
DisFircasi[, `:=`(trend, 1:.N)]

##Regression Model

After trying lots of models, the best regression model I achived is like that with 0.8655 adjusted R squared value.

lmDisFircasi <- lm(sold_count ~ basket_count + favored_count + 
    category_sold + category_favored + category_brand_sold, data = DisFircasi)
summary(lmDisFircasi)
## 
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold + 
##     category_favored + category_brand_sold, data = DisFircasi)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -146.90  -21.14   -5.19   18.23  104.94 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         19.789120   8.792242   2.251  0.02592 *  
## basket_count         0.240600   0.016440  14.635  < 2e-16 ***
## favored_count       -0.042636   0.013778  -3.094  0.00237 ** 
## category_sold        0.032223   0.013276   2.427  0.01645 *  
## category_favored    -0.002403   0.001639  -1.466  0.14479    
## category_brand_sold -0.002853   0.002377  -1.201  0.23185    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38.81 on 144 degrees of freedom
## Multiple R-squared:   0.87,  Adjusted R-squared:  0.8655 
## F-statistic: 192.7 on 5 and 144 DF,  p-value: < 2.2e-16
# this is the best model I can achieve.

##Decomposing

# time series and arima
tsDisFircasi <- ts(DisFircasi$sold_count, frequency = 7)



decompDisFircasi <- decompose(tsDisFircasi, type = "additive")


finalARIMAmodel3 <- arima(decompDisFircasi$random, order = c(1, 
    0, 5))
AIC(finalARIMAmodel3)
## [1] 1420.678

##ARIMA Models of Attributes

ARIMA model for Basket_count

basket_count_ts <- ts(DisFircasi$basket_count, start = as.Date("2021-04-26"), 
    end = as.Date("2021-05-31"), frequency = 7)
basket_count_dec <- decompose(x = basket_count_ts, type = "additive")
basket_count_model = auto.arima(basket_count_dec$random)
AIC(basket_count_model)
## [1] 3244.35
basket_count_model_forecast <- predict(basket_count_model, n.ahead = 1)$pred
seasonality = basket_count_dec$seasonal[1:1]
last_trend_value <- tail(basket_count_dec$trend[!is.na(basket_count_dec$trend)], 
    1)
basket_count_model_forecast = basket_count_model_forecast + last_trend_value + 
    seasonality
basket_count_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 654.4589

ARIMA model for Favored_count

favored_count_ts <- ts(DisFircasi$favored_count, start = as.Date("2021-04-26"), 
    end = as.Date("2021-05-31"), frequency = 7)
favored_count_dec <- decompose(x = favored_count_ts, type = "additive")
favored_count_model = auto.arima(favored_count_dec$random)
AIC(favored_count_model)
## [1] 3327.247
favored_count_model_forecast <- predict(favored_count_model, 
    n.ahead = 1)$pred
seasonality = favored_count_dec$seasonal[1:1]
last_trend_value <- tail(favored_count_dec$trend[!is.na(favored_count_dec$trend)], 
    1)
favored_count_model_forecast = favored_count_model_forecast + 
    last_trend_value + seasonality
favored_count_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 805.653

ARIMA model for Category_sold

category_sold_ts <- ts(DisFircasi$category_sold, start = as.Date("2021-04-26"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_sold_dec <- decompose(x = category_sold_ts, type = "additive")
category_sold_model = auto.arima(category_sold_dec$random)
AIC(category_sold_model)
## [1] 3265.814
category_sold_model_forecast <- predict(category_sold_model, 
    n.ahead = 1)$pred
seasonality = category_sold_dec$seasonal[1:1]
last_trend_value <- tail(category_sold_dec$trend[!is.na(category_sold_dec$trend)], 
    1)
category_sold_model_forecast = category_sold_model_forecast + 
    last_trend_value + seasonality
category_sold_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 776.2867

ARIMA model for Category_favored

category_favored_ts <- ts(DisFircasi$category_favored, start = as.Date("2021-04-26"), 
    end = as.Date("2021-05-31"), frequency = 7)
category_favored_dec <- decompose(x = category_favored_ts, type = "additive")
category_favored_model = auto.arima(category_favored_dec$random)
AIC(category_favored_model)
## [1] 4461.847
category_favored_model_forecast <- predict(category_favored_model, 
    n.ahead = 1)$pred
seasonality = category_favored_dec$seasonal[1:1]
last_trend_value <- tail(category_favored_dec$trend[!is.na(category_favored_dec$trend)], 
    1)
category_favored_model_forecast = category_favored_model_forecast + 
    last_trend_value + seasonality
category_favored_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 4024.603

ARIMA model for Category_brand_sold

category_brand_sold_ts <- ts(DisFircasi$category_brand_sold, 
    start = as.Date("2021-04-26"), end = as.Date("2021-05-31"), 
    frequency = 7)
category_brand_sold_dec <- decompose(x = category_brand_sold_ts, 
    type = "additive")
category_brand_sold_model = auto.arima(category_brand_sold_dec$random)
AIC(category_brand_sold_model)
## [1] 4256.853
category_brand_sold_model_forecast <- predict(category_brand_sold_model, 
    n.ahead = 1)$pred
seasonality = category_brand_sold_dec$seasonal[1:1]
last_trend_value <- tail(category_brand_sold_dec$trend[!is.na(category_brand_sold_dec$trend)], 
    1)
category_brand_sold_model_forecast = category_brand_sold_model_forecast + 
    last_trend_value + seasonality
category_brand_sold_model_forecast
## Time Series:
## Start = c(18778, 2) 
## End = c(18778, 2) 
## Frequency = 7 
## [1] 5161.364

We’ve forecasted the attributes until now.

##Prediction

predict(lmDisFircasi, data.frame(basket_count = basket_count_model_forecast, 
    favored_count = favored_count_model_forecast, category_sold = category_sold_model_forecast, 
    category_favored = category_favored_model_forecast, category_brand_sold = category_brand_sold_model_forecast))
##        1 
## 143.5191